kmer_analysis

Load packages

First we will load in the packages needed to execute this analysis

library(dplyr)
library(readr)
library(tidyr)
library(stringr)
library(magrittr)
library(ggplot2)
library(viridis)
library(ggfortify)
library(PCAtools)
library(edgeR)
library(AnnotationHub)
library(ensembldb)
library(pheatmap)
library(cowplot)
library(ggbeeswarm)
library(tibble)
library(EnsDb.Hsapiens.v86)
library(GenomicFeatures)
library(here)

Load Custom Functions

source(here("R/cor_funcs_variance.R"))

Some methods will be called “hisat2”, this just means that bootstrapping estimates have not been taken into account. Nothing super special going on.

I have also defined several functions here for quality of life purposes

# From 01_Annotations
gene_txp_anno <- read_csv(here("data/annotations/grch38_103_df.csv.gz"))

transcript_key <- gene_txp_anno %>%
  dplyr::filter(type == "transcript") %>%
  dplyr::select(transcript_id,
                transcript_name)

txp_gene_ensdb_lengths <- read_csv(
  here("data/annotations/txp_gene_ensdb_lengths.csv.gz")
  )

transcript_key <- dplyr::select(txp_gene_ensdb_lengths,
                                c("transcript_id" = tx_id,
                                  "transcript_name" = tx_name))

# From 02_dtelist_prep
hisat2_dtelist <- read_rds(here("data/dtelists/hisat2_dtelist.rds"))
kallisto_dtelist <- read_rds(here("data/dtelists/kallisto_dtelist.rds"))
sa_dtelist <- read_rds(here("data/dtelists/sa_dtelist.rds"))
star_dtelist <- read_rds(here("data/dtelists/star_dtelist.rds"))

# From 04_principal_component_analysis
hisat2_loadings <- read_csv(
  here("data/pca/loadings/hisat2_loadings.csv")
  ) %>%
  head(250)

kallisto_loadings <- read_csv(
  here("data/pca/loadings/kallisto_loadings.csv")
  ) %>%
  head(275)

salmon_loadings <- read_csv(
  here("data/pca/loadings/salmon_loadings.csv")
  ) %>%
  head(300)

star_loadings <- read_csv(
  here("data/pca/loadings/star_loadings.csv")
  ) %>%
  head(275)

# From figure2 make
hisat2_unique <- read_csv(here("data/unique_transcripts/hisat2_unique.csv.gz"))
kallisto_unique <- read_csv(
  here("data/unique_transcripts/kallisto_unique.csv.gz")
  )
salmon_unique <- read_csv(here("data/unique_transcripts/salmon_unique.csv.gz"))
star_unique <- read_csv(here("data/unique_transcripts/star_unique.csv.gz"))

Get Combined Data

joined_df <- get_joined_samples(w = hisat2_dtelist,
                                x = kallisto_dtelist,
                                y = star_dtelist,
                                z = sa_dtelist,
                                method_w = "HISAT2",
                                method_x = "Kallisto",
                                method_y = "STAR",
                                method_z = "Salmon",
                                sample = 1)

cor_df <- joined_df
cor_df$variance <- apply(cor_df[,-1], 1, var)
whole_cor <- cor_df %>%
  dplyr::left_join(gene_txp_anno,
                   by = "transcript_id")
lowcor_ensdb <- cor_df %>%
  dplyr::arrange(desc(variance)) %>%
  head(250) %>%
  dplyr::select(transcript_id) %>%
  dplyr::left_join(gene_txp_anno,
                   by = "transcript_id")

lowcor_ensdb <- lowcor_ensdb %>%
  dplyr::select(-score,
                -phase) %>%
  select_if(~ !all(is.na(.))) %>%
  dplyr::select(-ccds_id,
                -transcript_support_level,
                -tag) %>%
  drop_na()
highcor_ensdb <- cor_df %>%
  dplyr::arrange(variance) %>%
  head(250) %>%
  dplyr::select(transcript_id) %>%
  dplyr::left_join(gene_txp_anno,
                   by = "transcript_id")

highcor_ensdb <- highcor_ensdb %>%
  dplyr::select(-score,
                -phase) %>%
  select_if(~ !all(is.na(.))) %>%
  dplyr::select(-ccds_id,
                -transcript_support_level,
                -tag) %>%
  drop_na()

Create sequences object

edb <- EnsDb.Hsapiens.v86
dna <- ensembldb:::getGenomeTwoBitFile(edb)

Low Correlation Transcripts K-mer Analysis

yTx_lowcor <- exonsBy(edb, filter = TxIdFilter(lowcor_ensdb$transcript_id))
yTxSeqs_lowcor <- extractTranscriptSeqs(dna, yTx_lowcor)

lowcor_df <- yTxSeqs_lowcor %>%
  as.data.frame() %>%
  rownames_to_column("transcript_id") %>%
  mutate(transcript_id = paste0(">", transcript_id)) %>%
  set_colnames(c("transcript_id", "sequence")) %>%
  as_tibble()

lowcor_df %>%
  head(n = 200) %>%
  write_delim(here("data/kmer_analysis/correlation/lowcor/lowcor_seqs.txt"),
              delim = "\n",
              eol = "\n\n",
              col_names = FALSE)
loc=/Users/konstantinosbogias/Documents/PhD/R_Projects/MethodsChapter/data/kmer_analysis/correlation

cp ${loc}/lowcor/lowcor_seqs.txt ${loc}/lowcor/lowcor_seqs.fa
for i in {1..100}; do
    jellyfish count -m ${i} -s 100M --output ${loc}/lowcor/lowcor_mer_counts.jf -C ${loc}/lowcor/lowcor_seqs.fa
    jellyfish histo ${loc}/lowcor/lowcor_mer_counts.jf > ${loc}/lowcor/histos/lowcor_${i}_mer_counts_histo.txt
    jellyfish stats ${loc}/lowcor/lowcor_mer_counts.jf > ${loc}/lowcor/stats/lowcor_${i}_mer_counts_stats.txt
    echo "lowcor ${i}_mer done"
done

Kmer data

lowcor_lengths <- txp_gene_ensdb_lengths %>%
  dplyr::filter(tx_id %in% lowcor_ensdb$transcript_id)

lowcor_lengths$transcript_length %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     180    1344    2475    3287    4078   27019
lowcor_mer_hist <- data.frame()
for(file in list.files(here("data/kmer_analysis/correlation/lowcor/histos"))) {
  lowcor_mer_add <- read_delim(
    paste0(here("data/kmer_analysis/correlation/lowcor/histos/"), file),
    col_names = FALSE,
    delim = " ") %>%
    set_colnames(c("Repeats", "Frequency")) %>%
    mutate(kmer_length = file %>%
             str_extract("...mer") %>%
             str_remove("_") %>% 
             str_remove("_") %>% 
             str_remove("mer") %>%
             as.numeric())
  
  lowcor_mer_hist <- rbind(lowcor_mer_hist, lowcor_mer_add)
}

lowcor_mer_hist <- lowcor_mer_hist %>%
  mutate(group = "lowcor")

write_csv(lowcor_mer_hist,
          here("data/kmer_analysis/correlation/lowcor/lowcor_mer_hist.csv"))

lowcor_mer_stat <- data.frame()
for(file in list.files(here("data/kmer_analysis/correlation/lowcor/stats"))) {
  lowcor_stat_add <- read_delim(
    paste0(here("data/kmer_analysis/correlation/lowcor/stats/"), file),
    col_names = FALSE,
    delim = " ") %>% 
    as.data.frame() %>%
    column_to_rownames("X1") %>%
    mutate_if(is.character, as.numeric) %>%
    mutate(Frequency = coalesce(X2, X3, X4, X5)) %>%
    rownames_to_column("Type") %>%
    dplyr::select("Type", "Frequency") %>%
    mutate(Type = str_remove(Type, ":")) %>% 
    dplyr::mutate(kmer_length = file %>%
             str_extract("...mer") %>%
             str_remove("_") %>% 
             str_remove("_") %>% 
             str_remove("mer") %>%
                    as.numeric())
  
  lowcor_mer_stat <- rbind(lowcor_mer_stat, lowcor_stat_add)
}

lowcor_mer_stat <- lowcor_mer_stat %>%
  mutate(group = "lowcor")

write_csv(lowcor_mer_stat,
          here("data/kmer_analysis/correlation/lowcor/lowcorcor_mer_stat.csv"))

Kmer plotting

Wanna try and find long genes with less distinct kmers, so low Distinct and Unique, with a high Total. Possibly a low Max count but a high Repeat number.

lowcor_lengths <- txp_gene_ensdb_lengths %>%
  dplyr::filter(tx_id %in% lowcor_ensdb$transcript_id) %>%
  dplyr::select(tx_id, transcript_length, seqnames, strand, tx_biotype) %>%
  dplyr::mutate(group = "lowcor")
lowcor_lengths$transcript_length %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     180    1344    2475    3287    4078   27019
# Summarise kmer lengths
lowcor_mer_hist$kmer_length %>% table() %>% as.data.frame()
##       . Freq
## 1     0   15
## 2     1    1
## 3     2    1
## 4     3    7
## 5     4  136
## 6     5  443
## 7     6  601
## 8     7  284
## 9     8  149
## 10    9  100
## 11   10  150
## 12   11  112
## 13   12   94
## 14   13   78
## 15   14   70
## 16   15   68
## 17   16   58
## 18   17   62
## 19   18   60
## 20   19   58
## 21   20   56
## 22   21   54
## 23   22   58
## 24   23   56
## 25   24   54
## 26   25   54
## 27   26   52
## 28   27   50
## 29   28   52
## 30   29   52
## 31   30   50
## 32   31   50
## 33   32   46
## 34   33   42
## 35   34   42
## 36   35   40
## 37   36   38
## 38   37   34
## 39   38   34
## 40   39   34
## 41   40   30
## 42   41   30
## 43   42   30
## 44   43   28
## 45   44   28
## 46   45   28
## 47   46   30
## 48   47   30
## 49   48   30
## 50   49   28
## 51   50   28
## 52   51   26
## 53   52   26
## 54   53   26
## 55   54   26
## 56   55   26
## 57   56   26
## 58   57   26
## 59   58   28
## 60   59   28
## 61   60   28
## 62   61   28
## 63   62   28
## 64   63   28
## 65   64   28
## 66   65   28
## 67   66   28
## 68   67   28
## 69   68   30
## 70   69   30
## 71   70   30
## 72   71   30
## 73   72   30
## 74   73   30
## 75   74   30
## 76   75   30
## 77   76   30
## 78   77   30
## 79   78   32
## 80   79   32
## 81   80   32
## 82   81   32
## 83   82   32
## 84   83   32
## 85   84   32
## 86   85   32
## 87   86   32
## 88   87   32
## 89   88   32
## 90   89   32
## 91   90   30
## 92   91   30
## 93   92   30
## 94   93   30
## 95   94   30
## 96   95   30
## 97   96   30
## 98   97   30
## 99   98   30
## 100  99   30
## 101 100   15
lowcor_mer_hist %>%
  dplyr::filter(kmer_length >= 31) %>%
  dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  facet_grid(~ as.character(Repeats),
             scales = "free") +
  labs(x = "K-mer Length (bp)",
       y = "Frequency",
       fill = "Kmer") +
  theme_bw() +
  theme(axis.text = element_text(colour = "black", size = 10),
        axis.title = element_text(colour = "black", size = 12),
        strip.text = element_text(colour = "black", size = 12),
        strip.background = element_rect(fill = "lightblue"))

lowcor_mer_hist %>%
  dplyr::filter(kmer_length <= 20) %>%
  dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  labs(x = "K-mer Length (bp)",
       y = "Frequency",
       fill = "Kmer") +
  theme_bw() +
  theme(axis.text = element_text(colour = "black", size = 10),
        axis.title = element_text(colour = "black", size = 12),
        strip.text = element_text(colour = "black", size = 12),
        strip.background = element_rect(fill = "lightblue"),
        legend.position = "none")

lowcor_mer_stat %>%
  dplyr::filter(!Type == "Max_count") %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = Type)) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  facet_grid(~ Type) +
  theme_bw()

High Correlation Transcripts K-mer Analysis

yTx_highcor <- exonsBy(edb, filter = TxIdFilter(highcor_ensdb$transcript_id))
yTxSeqs_highcor <- extractTranscriptSeqs(dna, yTx_highcor)

highcor_df <- yTxSeqs_highcor %>%
  as.data.frame() %>%
  rownames_to_column("transcript_id") %>%
  mutate(transcript_id = paste0(">", transcript_id)) %>%
  set_colnames(c("transcript_id", "sequence")) %>%
  as_tibble()

highcor_df %>%
  head(200) %>%
  write_delim(here("data/kmer_analysis/correlation/highcor/highcor_seqs.txt"),
            delim = "\n",
            eol = "\n\n",
            col_names = FALSE)
loc=/Users/konstantinosbogias/Documents/PhD/R_Projects/MethodsChapter/data/kmer_analysis/correlation

cp ${loc}/highcor/highcor_seqs.txt ${loc}/highcor/highcor_seqs.fa
for i in {1..100}; do
    jellyfish count -m ${i} -s 100M --output ${loc}/highcor/highcor_mer_counts.jf -C ${loc}/highcor/highcor_seqs.fa
    jellyfish histo ${loc}/highcor/highcor_mer_counts.jf > ${loc}/highcor/histos/highcor_${i}_mer_counts_histo.txt
    jellyfish stats ${loc}/highcor/highcor_mer_counts.jf > ${loc}/highcor/stats/highcor_${i}_mer_counts_stats.txt
    echo "highcor ${i}_mer done"
done

Kmer data

highcor_mer_hist <- data.frame()
for(file in list.files(here("data/kmer_analysis/correlation/highcor/histos"))) {
  highcor_mer_add <- read_delim(
    paste0(here("data/kmer_analysis/correlation/highcor/histos/"), file),
    col_names = FALSE,
    delim = " ") %>%
    set_colnames(c("Repeats", "Frequency")) %>%
    mutate(kmer_length = file %>%
             str_extract("...mer") %>%
             str_remove("_") %>% 
             str_remove("_") %>% 
             str_remove("mer") %>%
             as.numeric())
  
  highcor_mer_hist <- rbind(highcor_mer_hist, highcor_mer_add)
}

highcor_mer_hist <- highcor_mer_hist %>%
  mutate(group = "highcor")

write_csv(highcor_mer_hist,
          here("data/kmer_analysis/correlation/highcor/highcor_mer_hist.csv"))

highcor_mer_stat <- data.frame()
for(file in list.files(here("data/kmer_analysis/correlation/highcor/stats"))) {
  highcor_stat_add <- read_delim(
    paste0(here("data/kmer_analysis/correlation/highcor/stats/"), file),
    col_names = FALSE,
    delim = " ") %>% 
    as.data.frame() %>%
    column_to_rownames("X1") %>%
    mutate_if(is.character, as.numeric) %>%
    mutate(Frequency = coalesce(X2, X3, X4, X5)) %>%
    rownames_to_column("Type") %>%
    dplyr::select("Type", "Frequency") %>%
    mutate(Type = str_remove(Type, ":")) %>% 
    dplyr::mutate(kmer_length = file %>%
             str_extract("...mer") %>%
             str_remove("_") %>% 
             str_remove("_") %>% 
             str_remove("mer") %>%
                    as.numeric())
  
  highcor_mer_stat <- rbind(highcor_mer_stat, highcor_stat_add)
}

highcor_mer_stat <- highcor_mer_stat %>%
  mutate(group = "highcor")

write_csv(highcor_mer_stat,
          here("data/kmer_analysis/correlation/highcor/highcor_mer_stat.csv"))

Kmer plotting

Wanna try and find long genes with less distinct kmers, so low Distinct and Unique, with a high Total. Possibly a low Max count but a high Repeat number.

highcor_lengths <- txp_gene_ensdb_lengths %>%
  dplyr::filter(tx_id %in% highcor_ensdb$transcript_id) %>%
  dplyr::select(tx_id, transcript_length, seqnames, strand, tx_biotype) %>%
  dplyr::mutate(group = "highcor")
highcor_lengths$transcript_length %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     527    2125    3423    4057    4876   20635
# Summarise kmer lengths
highcor_mer_hist$kmer_length %>% table() %>% as.data.frame()
##       . Freq
## 1     0    2
## 2     1    1
## 3     2    1
## 4     3    3
## 5     4  130
## 6     5  469
## 7     6  757
## 8     7  357
## 9     8  170
## 10    9   99
## 11   10  138
## 12   11  110
## 13   12   92
## 14   13   80
## 15   14   70
## 16   15   68
## 17   16   62
## 18   17   52
## 19   18   50
## 20   19   48
## 21   20   46
## 22   21   44
## 23   22   40
## 24   23   38
## 25   24   34
## 26   25   34
## 27   26   30
## 28   27   30
## 29   28   26
## 30   29   26
## 31   30   22
## 32   31   22
## 33   32   22
## 34   33   22
## 35   34   20
## 36   35   22
## 37   36   20
## 38   37   18
## 39   38   16
## 40   39   16
## 41   40   14
## 42   41   16
## 43   42   14
## 44   43   14
## 45   44   14
## 46   45   14
## 47   46   14
## 48   47   14
## 49   48   14
## 50   49   14
## 51   50   12
## 52   51   14
## 53   52   12
## 54   53   12
## 55   54   10
## 56   55   10
## 57   56   10
## 58   57   10
## 59   58   10
## 60   59   10
## 61   60   10
## 62   61   10
## 63   62    8
## 64   63    8
## 65   64    8
## 66   65    6
## 67   66    6
## 68   67    6
## 69   68    6
## 70   69    6
## 71   70    6
## 72   71    6
## 73   72    6
## 74   73    6
## 75   74    6
## 76   75    6
## 77   76    6
## 78   77    6
## 79   78    6
## 80   79    6
## 81   80    6
## 82   81    6
## 83   82    6
## 84   83    6
## 85   84    6
## 86   85    6
## 87   86    6
## 88   87    6
## 89   88    6
## 90   89    6
## 91   90    6
## 92   91    6
## 93   92    6
## 94   93    6
## 95   94    6
## 96   95    6
## 97   96    6
## 98   97    4
## 99   98    4
## 100  99    4
## 101 100    2
highcor_mer_hist %>%
  dplyr::filter(kmer_length >= 31) %>%
  dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  facet_grid(~ as.character(Repeats),
             scales = "free") +
  labs(x = "K-mer Length (bp)",
       y = "Frequency",
       fill = "Kmer") +
  theme_bw() +
  theme(axis.text = element_text(colour = "black", size = 10),
        axis.title = element_text(colour = "black", size = 12),
        strip.text = element_text(colour = "black", size = 12),
        strip.background = element_rect(fill = "lightblue"))

highcor_mer_hist %>%
  dplyr::filter(kmer_length <= 20) %>%
  dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  labs(x = "K-mer Length (bp)",
       y = "Frequency",
       fill = "Kmer") +
  theme_bw() +
  theme(axis.text = element_text(colour = "black", size = 10),
        axis.title = element_text(colour = "black", size = 12),
        strip.text = element_text(colour = "black", size = 12),
        strip.background = element_rect(fill = "lightblue"),
        legend.position = "none")

highcor_mer_stat %>%
  dplyr::filter(!Type == "Max_count") %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = Type)) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  facet_grid(~ Type) +
  theme_bw()

K-mer Analysis by PCA Loadings

HISAT2

hs2_tx_remove <- txp_gene_ensdb_lengths %>%
  dplyr::filter(tx_id %in% hisat2_loadings$transcript_id) %>%
  dplyr::filter(str_detect(seqnames, "^CHR")) %>%
  as.data.frame()

hisat2_loadings <- hisat2_loadings %>% 
  dplyr::filter(!transcript_id %in% hs2_tx_remove$tx_id)

yTx_hisat2 <- exonsBy(edb,
                       filter = TxIdFilter(hisat2_loadings$transcript_id))

yTxSeqs_hisat2 <- extractTranscriptSeqs(dna, yTx_hisat2)

hisat2_kmer_df <- yTxSeqs_hisat2 %>%
  as.data.frame() %>%
  rownames_to_column("transcript_id") %>%
  mutate(transcript_id = paste0(">", transcript_id)) %>%
  set_colnames(c("transcript_id", "sequence")) %>%
  as_tibble()

hisat2_kmer_df %>%
  head(200) %>%
  write_delim(here("data/kmer_analysis/loadings/hisat2/hisat2_seqs.txt"),
              delim = "\n",
              eol = "\n\n",
              col_names = FALSE)
loc=/Users/konstantinosbogias/Documents/PhD/R_Projects/MethodsChapter/data/kmer_analysis/loadings

cp ${loc}/hisat2/hisat2_seqs.txt ${loc}/hisat2/hisat2_seqs.fa
for i in {1..100}; do
    jellyfish count -m ${i} -s 100M --output ${loc}/hisat2/hisat2_mer_counts.jf -C ${loc}/hisat2/hisat2_seqs.fa
    jellyfish histo ${loc}/hisat2/hisat2_mer_counts.jf > ${loc}/hisat2/histos/hisat2_${loc}_mer_counts_histo.txt
    jellyfish stats ${loc}/hisat2/hisat2_mer_counts.jf > ${loc}/hisat2/stats/hisat2_${i}_mer_counts_stats.txt
    echo "hisat2 ${i}_mer done"
done

Kmer data

hisat2_mer_hist <- data.frame()
for(file in list.files(here("data/kmer_analysis/loadings/hisat2/histos"))) {
  hisat2_mer_add <- read_delim(
    paste0(here("data/kmer_analysis/loadings/hisat2/histos/"), file),
    col_names = FALSE,
    delim = " ") %>%
    set_colnames(c("Repeats", "Frequency")) %>%
    mutate(kmer_length = file %>%
             str_extract("...mer") %>%
             str_remove("_") %>% 
             str_remove("_") %>% 
             str_remove("mer") %>%
             as.numeric())
  
  hisat2_mer_hist <- rbind(hisat2_mer_hist, hisat2_mer_add)
}

hisat2_mer_hist <- hisat2_mer_hist %>%
  mutate(group = "hisat2")

write_csv(hisat2_mer_hist,
          here("data/kmer_analysis/loadings/hisat2/hisat2_mer_hist.csv"))

hisat2_mer_stat <- data.frame()
for(file in list.files(here("data/kmer_analysis/loadings/hisat2/stats"))) {
  hisat2_stat_add <- read_delim(
    paste0(here("data/kmer_analysis/loadings/hisat2/stats/"), file),
    col_names = FALSE,
    delim = " ") %>% 
    as.data.frame() %>%
    column_to_rownames("X1") %>%
    mutate_if(is.character, as.numeric) %>%
    mutate(Frequency = coalesce(X2, X3, X4, X5)) %>%
    rownames_to_column("Type") %>%
    dplyr::select("Type", "Frequency") %>%
    mutate(Type = str_remove(Type, ":")) %>% 
    dplyr::mutate(kmer_length = file %>%
             str_extract("...mer") %>%
             str_remove("_") %>% 
             str_remove("_") %>% 
             str_remove("mer") %>%
                    as.numeric())
  
  hisat2_mer_stat <- rbind(hisat2_mer_stat, hisat2_stat_add)
}

hisat2_mer_stat <- hisat2_mer_stat %>%
  mutate(group = "hisat2")

write_csv(hisat2_mer_stat,
          here("data/kmer_analysis/loadings/hisat2/hisat2_mer_stat.csv"))

Kmer plotting

Wanna try and find long genes with less distinct kmers, so low Distinct and Unique, with a high Total. Possibly a low Max count but a high Repeat number.

hisat2_lengths <- txp_gene_ensdb_lengths %>%
  dplyr::filter(tx_id %in% hisat2_loadings$transcript_id) %>%
  dplyr::select(tx_id, transcript_length, seqnames, strand, tx_biotype) %>%
  dplyr::mutate(group = "hisat2")
hisat2_lengths$transcript_length %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     331    1326    2250    2976    3838   18760
# Summarise kmer lengths
hisat2_mer_hist$kmer_length %>% table() %>% as.data.frame()
##      . Freq
## 1    0   15
## 2    1    1
## 3    2    1
## 4    3    7
## 5    4  134
## 6    5  425
## 7    6  551
## 8    7  263
## 9    8  144
## 10   9   89
## 11  10   70
## 12  11   54
## 13  12   48
## 14  13   38
## 15  14   34
## 16  15   30
## 17  16   24
## 18  17   24
## 19  18   23
## 20  19   23
## 21  20   23
## 22  21   23
## 23  22   23
## 24  23   23
## 25  24   22
## 26  25   22
## 27  26   22
## 28  27   21
## 29  28   22
## 30  29   22
## 31  30   22
## 32  31   22
## 33  32   20
## 34  33   19
## 35  34   19
## 36  35   18
## 37  36   17
## 38  37   15
## 39  38   15
## 40  39   15
## 41  40   13
## 42  41   13
## 43  42   13
## 44  43   12
## 45  44   12
## 46  45   12
## 47  46   13
## 48  47   13
## 49  48   13
## 50  49   12
## 51  50   12
## 52  51   12
## 53  52   12
## 54  53   12
## 55  54   12
## 56  55   12
## 57  56   12
## 58  57   12
## 59  58   12
## 60  59   12
## 61  60   12
## 62  61   12
## 63  62   12
## 64  63   12
## 65  64   12
## 66  65   13
## 67  66   13
## 68  67   13
## 69  68   14
## 70  69   14
## 71  70   14
## 72  71   14
## 73  72   14
## 74  73   14
## 75  74   14
## 76  75   14
## 77  76   14
## 78  77   14
## 79  78   15
## 80  79   15
## 81  80   15
## 82  81   15
## 83  82   15
## 84  83   15
## 85  84   15
## 86  85   15
## 87  86   15
## 88  87   15
## 89  88   15
## 90  89   15
## 91  90   15
## 92  91   15
## 93  92   15
## 94  93   15
## 95  94   15
## 96  95   15
## 97  96   15
## 98  97   15
## 99  98   15
## 100 99   15
hisat2_mer_hist %>%
  dplyr::filter(kmer_length >= 31) %>%
  dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  facet_grid(~ as.character(Repeats),
             scales = "free") +
  labs(x = "K-mer Length (bp)",
       y = "Frequency",
       fill = "Kmer") +
  theme_bw() +
  theme(axis.text = element_text(colour = "black", size = 10),
        axis.title = element_text(colour = "black", size = 12),
        strip.text = element_text(colour = "black", size = 12),
        strip.background = element_rect(fill = "lightblue"))

hisat2_mer_hist %>%
  dplyr::filter(kmer_length <= 20) %>%
  dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  labs(x = "K-mer Length (bp)",
       y = "Frequency",
       fill = "Kmer") +
  theme_bw() +
  theme(axis.text = element_text(colour = "black", size = 10),
        axis.title = element_text(colour = "black", size = 12),
        strip.text = element_text(colour = "black", size = 12),
        strip.background = element_rect(fill = "lightblue"),
        legend.position = "none")

hisat2_mer_stat %>%
  dplyr::filter(!Type == "Max_count") %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = Type)) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  facet_grid(~ Type) +
  theme_bw()

Kallisto

k_tx_remove <- txp_gene_ensdb_lengths %>%
  dplyr::filter(tx_id %in% kallisto_loadings$transcript_id) %>%
  dplyr::filter(str_detect(seqnames, "^CHR")) %>%
  as.data.frame()

kallisto_loadings <- kallisto_loadings %>%
  dplyr::filter(transcript_id %in% gene_txp_anno$transcript_id)

yTx_kallisto <- exonsBy(edb, 
                        filter = TxIdFilter(kallisto_loadings$transcript_id))

yTxSeqs_kallisto <- extractTranscriptSeqs(dna, yTx_kallisto)

kallisto_kmer_df <- yTxSeqs_kallisto %>%
  as.data.frame() %>%
  rownames_to_column("transcript_id") %>%
  mutate(transcript_id = paste0(">", transcript_id)) %>%
  set_colnames(c("transcript_id", "sequence")) %>%
  as_tibble()

kallisto_kmer_df %>%
  head(200) %>%
  write_delim(here("data/kmer_analysis/loadings/kallisto/kallisto_seqs.txt"),
              delim = "\n",
              eol = "\n\n",
              col_names = FALSE)
loc=/Users/konstantinosbogias/Documents/PhD/R_Projects/MethodsChapter/data/kmer_analysis/loadings

cp ${loc}/kallisto/kallisto_seqs.txt ${loc}/kallisto/kallisto_seqs.fa
for i in {1..100}; do
    jellyfish count -m ${i} -s 100M --output ${loc}/kallisto/kallisto_mer_counts.jf -C ${loc}/kallisto/kallisto_seqs.fa
    jellyfish histo ${loc}/kallisto/kallisto_mer_counts.jf > ${loc}/kallisto/histos/kallisto_${i}_mer_counts_histo.txt
    jellyfish stats ${loc}/kallisto/kallisto_mer_counts.jf > ${loc}/kallisto/stats/kallisto_${i}_mer_counts_stats.txt
    echo "kallisto ${i}_mer done"
done

Kmer data

kallisto_mer_hist <- data.frame()
for(file in list.files(here("data/kmer_analysis/loadings/kallisto/histos"))) {
  kallisto_mer_add <- read_delim(
    paste0(here("data/kmer_analysis/loadings/kallisto/histos/"), file),
    col_names = FALSE,
    delim = " ") %>%
    set_colnames(c("Repeats", "Frequency")) %>%
    mutate(kmer_length = file %>%
             str_extract("...mer") %>%
             str_remove("_") %>% 
             str_remove("_") %>% 
             str_remove("mer") %>%
             as.numeric())
  
  kallisto_mer_hist <- rbind(kallisto_mer_hist, kallisto_mer_add)
}

kallisto_mer_hist <- kallisto_mer_hist %>%
  mutate(group = "kallisto")

write_csv(kallisto_mer_hist,
          here("data/kmer_analysis/loadings/kallisto/kallisto_mer_hist.csv"))

kallisto_mer_stat <- data.frame()
for(file in list.files(here("data/kmer_analysis/loadings/kallisto/stats"))) {
  kallisto_stat_add <- read_delim(
    paste0(here("data/kmer_analysis/loadings/kallisto/stats/"), file),
    col_names = FALSE,
    delim = " ") %>% 
    as.data.frame() %>%
    column_to_rownames("X1") %>%
    mutate_if(is.character, as.numeric) %>%
    mutate(Frequency = coalesce(X2, X3, X4, X5)) %>%
    rownames_to_column("Type") %>%
    dplyr::select("Type", "Frequency") %>%
    mutate(Type = str_remove(Type, ":")) %>% 
    dplyr::mutate(kmer_length = file %>%
             str_extract("...mer") %>%
             str_remove("_") %>% 
             str_remove("_") %>% 
             str_remove("mer") %>%
                    as.numeric())
  
  kallisto_mer_stat <- rbind(kallisto_mer_stat, kallisto_stat_add)
}

kallisto_mer_stat <- kallisto_mer_stat %>%
  mutate(group = "kallisto")

write_csv(kallisto_mer_stat,
          here("data/kmer_analysis/loadings/kallisto/kallisto_mer_stat.csv"))

Kmer plotting

Wanna try and find long genes with less distinct kmers, so low Distinct and Unique, with a high Total. Possibly a low Max count but a high Repeat number.

kallisto_lengths <- txp_gene_ensdb_lengths %>%
  dplyr::filter(tx_id %in% kallisto_loadings$transcript_id) %>%
  dplyr::select(tx_id, transcript_length, seqnames, strand, tx_biotype) %>%
  dplyr::mutate(group = "kallisto")
kallisto_lengths$transcript_length %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     164    1344    2540    3944    3912  205012
# Summarise kmer lengths
kallisto_mer_hist$kmer_length %>% table() %>% as.data.frame()
##      . Freq
## 1    0   24
## 2    1    1
## 3    2    1
## 4    3    5
## 5    4  136
## 6    5  459
## 7    6  631
## 8    7  327
## 9    8  200
## 10   9  137
## 11  10  101
## 12  11   78
## 13  12   66
## 14  13   55
## 15  14   49
## 16  15   47
## 17  16   44
## 18  17   40
## 19  18   42
## 20  19   42
## 21  20   40
## 22  21   40
## 23  22   37
## 24  23   37
## 25  24   35
## 26  25   35
## 27  26   35
## 28  27   34
## 29  28   35
## 30  29   34
## 31  30   34
## 32  31   34
## 33  32   30
## 34  33   29
## 35  34   28
## 36  35   27
## 37  36   27
## 38  37   26
## 39  38   25
## 40  39   25
## 41  40   23
## 42  41   23
## 43  42   22
## 44  43   21
## 45  44   21
## 46  45   21
## 47  46   22
## 48  47   22
## 49  48   22
## 50  49   21
## 51  50   21
## 52  51   21
## 53  52   21
## 54  53   21
## 55  54   21
## 56  55   21
## 57  56   22
## 58  57   22
## 59  58   22
## 60  59   22
## 61  60   22
## 62  61   22
## 63  62   22
## 64  63   21
## 65  64   21
## 66  65   21
## 67  66   21
## 68  67   21
## 69  68   21
## 70  69   19
## 71  70   20
## 72  71   20
## 73  72   20
## 74  73   20
## 75  74   20
## 76  75   20
## 77  76   20
## 78  77   20
## 79  78   21
## 80  79   21
## 81  80   21
## 82  81   21
## 83  82   22
## 84  83   22
## 85  84   22
## 86  85   22
## 87  86   21
## 88  87   22
## 89  88   22
## 90  89   22
## 91  90   22
## 92  91   22
## 93  92   22
## 94  93   22
## 95  94   22
## 96  95   23
## 97  96   23
## 98  97   23
## 99  98   23
## 100 99   23
kallisto_mer_hist %>%
  dplyr::filter(kmer_length >= 31) %>%
  dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  facet_grid(~ as.character(Repeats),
             scales = "free") +
  labs(x = "K-mer Length (bp)",
       y = "Frequency",
       fill = "Kmer") +
  theme_bw() +
  theme(axis.text = element_text(colour = "black", size = 10),
        axis.title = element_text(colour = "black", size = 12),
        strip.text = element_text(colour = "black", size = 12),
        strip.background = element_rect(fill = "lightblue"))

kallisto_mer_hist %>%
  dplyr::filter(kmer_length <= 20) %>%
  dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  labs(x = "K-mer Length (bp)",
       y = "Frequency",
       fill = "Kmer") +
  theme_bw() +
  theme(axis.text = element_text(colour = "black", size = 10),
        axis.title = element_text(colour = "black", size = 12),
        strip.text = element_text(colour = "black", size = 12),
        strip.background = element_rect(fill = "lightblue"),
        legend.position = "none")

kallisto_mer_stat %>%
  dplyr::filter(!Type == "Max_count") %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = Type)) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  facet_grid(~ Type) +
  theme_bw()

Salmon

sal_tx_remove <- txp_gene_ensdb_lengths %>%
  dplyr::filter(tx_id %in% salmon_loadings$transcript_id) %>%
  dplyr::filter(str_detect(seqnames, "^CHR")) %>%
  as.data.frame()

salmon_loadings <- salmon_loadings %>%
  dplyr::filter(!transcript_id %in% sal_tx_remove$tx_id)

yTx_salmon <- exonsBy(edb, filter = TxIdFilter(salmon_loadings$transcript_id))

yTxSeqs_salmon <- extractTranscriptSeqs(dna, yTx_salmon)

salmon_kmer_df <- yTxSeqs_salmon %>%
  as.data.frame() %>%
  rownames_to_column("transcript_id") %>%
  mutate(transcript_id = paste0(">", transcript_id)) %>%
  set_colnames(c("transcript_id", "sequence")) %>%
  as_tibble()

# Call head to make sure that we get 200 transcripts for each group!
salmon_kmer_df %>%
  head(200) %>%
  write_delim(here("data/kmer_analysis/loadings/salmon/salmon_seqs.txt"),
              delim = "\n",
              eol = "\n\n",
              col_names = FALSE)
loc=/Users/konstantinosbogias/Documents/PhD/R_Projects/MethodsChapter/data/kmer_analysis/loadings

cp ${loc}/salmon/salmon_seqs.txt ${loc}/salmon/salmon_seqs.fa
for i in {1..100}; do
    jellyfish count -m ${i} -s 100M --output ${loc}/salmon/salmon_mer_counts.jf -C ${loc}/salmon/salmon_seqs.fa
    jellyfish histo ${loc}/salmon/salmon_mer_counts.jf > ${loc}/salmon/histos/salmon_${i}_mer_counts_histo.txt
    jellyfish stats ${loc}/salmon/salmon_mer_counts.jf > ${loc}/salmon/stats/salmon_${i}_mer_counts_stats.txt
    echo "salmon ${i}_mer done"
done

Kmer data

salmon_mer_hist <- data.frame()
for(file in list.files(here("data/kmer_analysis/loadings/salmon/histos"))) {
  salmon_mer_add <- read_delim(
    paste0(here("data/kmer_analysis/loadings/salmon/histos/"), file),
    col_names = FALSE,
    delim = " ") %>%
    set_colnames(c("Repeats", "Frequency")) %>%
    mutate(kmer_length = file %>%
             str_extract("...mer") %>%
             str_remove("_") %>% 
             str_remove("_") %>% 
             str_remove("mer") %>%
             as.numeric())
  
  salmon_mer_hist <- rbind(salmon_mer_hist, salmon_mer_add)
}

salmon_mer_hist <- salmon_mer_hist %>%
  mutate(group = "salmon")

write_csv(salmon_mer_hist,
          here("data/kmer_analysis/loadings/salmon/salmon_mer_hist.csv"))

salmon_mer_stat <- data.frame()
for(file in list.files(here("data/kmer_analysis/loadings/salmon/stats"))) {
  salmon_stat_add <- read_delim(
    paste0(here("data/kmer_analysis/loadings/salmon/stats/"), file),
    col_names = FALSE,
    delim = " ") %>% 
    as.data.frame() %>%
    column_to_rownames("X1") %>%
    mutate_if(is.character, as.numeric) %>%
    mutate(Frequency = coalesce(X2, X3, X4, X5)) %>%
    rownames_to_column("Type") %>%
    dplyr::select("Type", "Frequency") %>%
    mutate(Type = str_remove(Type, ":")) %>% 
    dplyr::mutate(kmer_length = file %>%
             str_extract("...mer") %>%
             str_remove("_") %>% 
             str_remove("_") %>% 
             str_remove("mer") %>%
                    as.numeric())
  
  salmon_mer_stat <- rbind(salmon_mer_stat, salmon_stat_add)
}

salmon_mer_stat <- salmon_mer_stat %>%
  mutate(group = "salmon")

write_csv(salmon_mer_stat,
          here("data/kmer_analysis/loadings/salmon/salmon_mer_stat.csv"))

Kmer plotting

Wanna try and find long genes with less distinct kmers, so low Distinct and Unique, with a high Total. Possibly a low Max count but a high Repeat number.

salmon_lengths <- txp_gene_ensdb_lengths %>%
  dplyr::filter(tx_id %in% salmon_loadings$transcript_id) %>%
  dplyr::select(tx_id, transcript_length, seqnames, strand, tx_biotype) %>%
  dplyr::mutate(group = "salmon")
salmon_lengths$transcript_length %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     153    1813    2955    3732    4681   27019
# Summarise kmer lengths
salmon_mer_hist$kmer_length %>% table() %>% as.data.frame()
##      . Freq
## 1    0    5
## 2    1    1
## 3    2    1
## 4    3    7
## 5    4  135
## 6    5  431
## 7    6  551
## 8    7  256
## 9    8  118
## 10   9   68
## 11  10   48
## 12  11   36
## 13  12   29
## 14  13   26
## 15  14   25
## 16  15   21
## 17  16   21
## 18  17   18
## 19  18   18
## 20  19   17
## 21  20   17
## 22  21   14
## 23  22   12
## 24  23   12
## 25  24   11
## 26  25   10
## 27  26   10
## 28  27    9
## 29  28    9
## 30  29    8
## 31  30    8
## 32  31    7
## 33  32    7
## 34  33    6
## 35  34    6
## 36  35    6
## 37  36    6
## 38  37    6
## 39  38    6
## 40  39    6
## 41  40    6
## 42  41    6
## 43  42    6
## 44  43    6
## 45  44    5
## 46  45    6
## 47  46    5
## 48  47    6
## 49  48    5
## 50  49    5
## 51  50    5
## 52  51    5
## 53  52    5
## 54  53    5
## 55  54    5
## 56  55    5
## 57  56    5
## 58  57    5
## 59  58    5
## 60  59    5
## 61  60    5
## 62  61    5
## 63  62    5
## 64  63    5
## 65  64    5
## 66  65    5
## 67  66    5
## 68  67    5
## 69  68    5
## 70  69    5
## 71  70    5
## 72  71    5
## 73  72    5
## 74  73    5
## 75  74    5
## 76  75    5
## 77  76    5
## 78  77    5
## 79  78    5
## 80  79    5
## 81  80    5
## 82  81    5
## 83  82    5
## 84  83    5
## 85  84    5
## 86  85    5
## 87  86    5
## 88  87    5
## 89  88    5
## 90  89    5
## 91  90    5
## 92  91    5
## 93  92    5
## 94  93    5
## 95  94    5
## 96  95    5
## 97  96    5
## 98  97    5
## 99  98    5
## 100 99    5
salmon_mer_hist %>%
  dplyr::filter(kmer_length >= 31) %>%
  dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  facet_grid(~ as.character(Repeats),
             scales = "free") +
  labs(x = "K-mer Length (bp)",
       y = "Frequency",
       fill = "Kmer") +
  theme_bw() +
  theme(axis.text = element_text(colour = "black", size = 10),
        axis.title = element_text(colour = "black", size = 12),
        strip.text = element_text(colour = "black", size = 12),
        strip.background = element_rect(fill = "lightblue"))

salmon_mer_hist %>%
  dplyr::filter(kmer_length <= 20) %>%
  dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  labs(x = "K-mer Length (bp)",
       y = "Frequency",
       fill = "Kmer") +
  theme_bw() +
  theme(axis.text = element_text(colour = "black", size = 10),
        axis.title = element_text(colour = "black", size = 12),
        strip.text = element_text(colour = "black", size = 12),
        strip.background = element_rect(fill = "lightblue"),
        legend.position = "none")

salmon_mer_stat %>%
  dplyr::filter(!Type == "Max_count") %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = Type)) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  facet_grid(~ Type) +
  theme_bw()

STAR

star_loadings <- star_loadings %>%
  dplyr::filter(transcript_id %in% gene_txp_anno$transcript_id)

yTx_sa <- exonsBy(edb, filter = TxIdFilter(star_loadings$transcript_id))

yTxSeqs_sa <- extractTranscriptSeqs(dna, yTx_sa)

star_kmer_df <- yTxSeqs_sa %>%
  as.data.frame() %>%
  rownames_to_column("transcript_id") %>%
  mutate(transcript_id = paste0(">", transcript_id)) %>%
  set_colnames(c("transcript_id", "sequence")) %>%
  as_tibble()

star_kmer_df %>%
  head(200) %>%
  write_delim(here("data/kmer_analysis/loadings/star/star_seqs.txt"),
            delim = "\n",
            eol = "\n\n",
            col_names = FALSE)
loc=/Users/konstantinosbogias/Documents/PhD/R_Projects/MethodsChapter/data/kmer_analysis/loadings

cp ${loc}/star/star_seqs.txt ${loc}/star/star_seqs.fa
for i in {1..100}; do
    jellyfish count -m ${i} -s 100M --output ${loc}/star/star_mer_counts.jf -C ${loc}/star/star_seqs.fa
    jellyfish histo ${loc}/star/star_mer_counts.jf > ${loc}/star/histos/star_${i}_mer_counts_histo.txt
    jellyfish stats ${loc}/star/star_mer_counts.jf > ${loc}/star/stats/star_${i}_mer_counts_stats.txt
    echo "sa ${i}_mer done"
done

Kmer data

star_mer_hist <- data.frame()
for(file in list.files(here("data/kmer_analysis/loadings/star/histos"))) {
  star_mer_add <- read_delim(
    paste0(here("data/kmer_analysis/loadings/star/histos/"), file),
    col_names = FALSE,
    delim = " ") %>%
    set_colnames(c("Repeats", "Frequency")) %>%
    mutate(kmer_length = file %>%
             str_extract("...mer") %>%
             str_remove("_") %>% 
             str_remove("_") %>% 
             str_remove("mer") %>%
             as.numeric())
  
  star_mer_hist <- rbind(star_mer_hist, star_mer_add)
}

star_mer_hist <- star_mer_hist %>%
  mutate(group = "star")

write_csv(star_mer_hist,
          here("data/kmer_analysis/loadings/star/star_mer_hist.csv"))

star_mer_stat <- data.frame()
for(file in list.files(here("data/kmer_analysis/loadings/star/stats"))) {
  star_stat_add <- read_delim(
    paste0(here("data/kmer_analysis/loadings/star/stats/"), file),
    col_names = FALSE,
    delim = " ") %>% 
    as.data.frame() %>%
    column_to_rownames("X1") %>%
    mutate_if(is.character, as.numeric) %>%
    mutate(Frequency = coalesce(X2, X3, X4, X5)) %>%
    rownames_to_column("Type") %>%
    dplyr::select("Type", "Frequency") %>%
    mutate(Type = str_remove(Type, ":")) %>% 
    dplyr::mutate(kmer_length = file %>%
             str_extract("...mer") %>%
             str_remove("_") %>% 
             str_remove("_") %>% 
             str_remove("mer") %>%
                    as.numeric())
  
  star_mer_stat <- rbind(star_mer_stat, star_stat_add)
}

star_mer_stat <- star_mer_stat %>%
  mutate(group = "star")

write_csv(star_mer_stat,
          here("data/kmer_analysis/loadings/star/star_mer_stat.csv"))

Kmer plotting

Wanna try and find long genes with less distinct kmers, so low Distinct and Unique, with a high Total. Possibly a low Max count but a high Repeat number.

star_lengths <- txp_gene_ensdb_lengths %>%
  dplyr::filter(tx_id %in% star_loadings$transcript_id) %>%
  dplyr::select(tx_id, transcript_length, seqnames, strand, tx_biotype) %>%
  dplyr::mutate(group = "star")
star_lengths$transcript_length %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     217    1112    2177    3021    3964   20345
# Summarise kmer lengths
star_mer_hist$kmer_length %>% table() %>% as.data.frame()
##      . Freq
## 1    0    5
## 2    1    1
## 3    2    1
## 4    3    8
## 5    4  134
## 6    5  447
## 7    6  538
## 8    7  250
## 9    8  119
## 10   9   66
## 11  10   47
## 12  11   34
## 13  12   29
## 14  13   26
## 15  14   23
## 16  15   20
## 17  16   20
## 18  17   18
## 19  18   18
## 20  19   16
## 21  20   17
## 22  21   14
## 23  22   12
## 24  23   13
## 25  24   12
## 26  25   10
## 27  26   10
## 28  27    9
## 29  28    9
## 30  29    8
## 31  30    8
## 32  31    7
## 33  32    7
## 34  33    6
## 35  34    6
## 36  35    6
## 37  36    6
## 38  37    6
## 39  38    6
## 40  39    6
## 41  40    6
## 42  41    6
## 43  42    6
## 44  43    6
## 45  44    5
## 46  45    6
## 47  46    5
## 48  47    6
## 49  48    5
## 50  49    5
## 51  50    5
## 52  51    5
## 53  52    5
## 54  53    5
## 55  54    5
## 56  55    5
## 57  56    5
## 58  57    5
## 59  58    5
## 60  59    5
## 61  60    5
## 62  61    5
## 63  62    5
## 64  63    5
## 65  64    5
## 66  65    5
## 67  66    5
## 68  67    5
## 69  68    5
## 70  69    5
## 71  70    5
## 72  71    5
## 73  72    5
## 74  73    5
## 75  74    5
## 76  75    5
## 77  76    5
## 78  77    5
## 79  78    5
## 80  79    5
## 81  80    5
## 82  81    5
## 83  82    5
## 84  83    5
## 85  84    5
## 86  85    5
## 87  86    5
## 88  87    5
## 89  88    5
## 90  89    5
## 91  90    5
## 92  91    5
## 93  92    5
## 94  93    5
## 95  94    5
## 96  95    5
## 97  96    5
## 98  97    5
## 99  98    5
## 100 99    5
star_mer_hist %>%
  dplyr::filter(kmer_length >= 31) %>%
  ggplot(aes(x = kmer_length, y = Repeats)) + 
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7)

star_mer_hist %>%
  dplyr::filter(kmer_length >= 31) %>%
  dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  facet_grid(~ as.character(Repeats),
             scales = "free") +
  labs(x = "K-mer Length (bp)",
       y = "Frequency",
       fill = "Kmer") +
  theme_bw() +
  theme(axis.text = element_text(colour = "black", size = 10),
        axis.title = element_text(colour = "black", size = 12),
        strip.text = element_text(colour = "black", size = 12),
        strip.background = element_rect(fill = "lightblue"))

star_mer_hist %>%
  dplyr::filter(kmer_length <= 20) %>%
  dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  labs(x = "K-mer Length (bp)",
       y = "Frequency",
       fill = "Kmer") +
  theme_bw() +
  theme(axis.text = element_text(colour = "black", size = 10),
        axis.title = element_text(colour = "black", size = 12),
        strip.text = element_text(colour = "black", size = 12),
        strip.background = element_rect(fill = "lightblue"),
        legend.position = "none")

star_mer_stat %>%
  dplyr::filter(!Type == "Max_count") %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = Type)) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  facet_grid(~ Type) +
  theme_bw()

Plotting

figure 5

lengths_df <- rbind(hisat2_lengths,
                    kallisto_lengths,
                    star_lengths,
                    salmon_lengths)

lengths_by_method <- lengths_df %>%
  dplyr::mutate(
    group = case_when(
      group == "kallisto" ~ "Kallisto",
      group == "star" ~ "STAR",
      group == "hisat2" ~ "HISAT2",
      group == "salmon" ~ "Salmon"
    ),
    group = factor(group, levels = c("Concordant Transcripts",
                                     "Discordant Transcripts",
                                     #"Uncorrelated",
                                     "Top Negative\nPC1 Loadings",
                                     "Top Negative\nPC2 Loadings",
                                     "Top Positive\nPC2 Loadings",
                                     "Top Negative\nPC5 Loadings"))) %>%
  ggplot(aes(x = group, y = transcript_length)) +
  geom_boxplot(fill = "lightblue") +
  scale_y_log10() +
  labs(x = "",
       y = "Transcript Length (log10)") +
  theme_bw()

lengths_by_method %>%
  ggsave(filename = "length_by_method_plot.png",
         path = here("figures/kmer_analysis/all_methods_plots/"),
         device = "png",
         height = 200,
         width = 200,
         units = "mm",
         dpi = 400)

txp_lengths_hist <- txp_gene_ensdb_lengths %>%
  dplyr::select(tx_id, transcript_length) %>%
  mutate(group = "All Transcripts") %>%
  rbind(lengths_df %>%
          dplyr::select(tx_id, transcript_length) %>%
          dplyr::mutate(group = "Top Loadings Transcripts")) %>%
  ggplot(aes(x = log10(transcript_length))) +
  geom_histogram(bins = 200) +
  scale_x_continuous(limits = c(1, 6),
                     breaks = c(1, 2, 3, 4, 5, 6)) +
  labs(x = "Transcript Length (log10)",
       y = "Frequency") +
  facet_grid(group ~ ., switch = "y", scales = "free") +
  theme_bw() +
  theme(axis.text = element_text(colour = "black", size = 12),
        axis.title = element_text(colour = "black", size = 14),
        strip.background = element_rect(fill = "lightblue"),
        strip.text = element_text(colour = "black", size = 14))

txp_lengths_hist %>%
  ggsave(filename = "length_plot.png",
         path = here("figures/kmer_analysis/all_methods_plots/"),
         device = "png",
         height = 150,
         width = 200,
         units = "mm",
         dpi = 400)

# We can see that there is stuff all variation in frequency within groups
stat_df <- rbind(#noncor_mer_stat,
                 lowcor_mer_stat,
                 highcor_mer_stat,
                 hisat2_mer_stat,
                 kallisto_mer_stat,
                 star_mer_stat,
                 salmon_mer_stat)

stat_df <- stat_df %>%
  dplyr::mutate(kmer_length = case_when(
    kmer_length == 0 ~ 100,
    .default = kmer_length
  ))

group_names <- stat_df$group %>% unique()
new_stat_df <- data.frame()
stat_group_chunk <- data.frame()
for(g in group_names) {
  for(i in 1:100) {
    stat_kmer_chunk <- stat_df %>%
      dplyr::filter(kmer_length == i) %>%
      dplyr::filter(group == g) %>%
      dplyr::select(Type, Frequency) %>%
      distinct() %>%
      column_to_rownames("Type") %>%
      t() %>%
      as_tibble() %>%
      mutate(Multiplicity = Total-Distinct) %>%
      t() %>%
      set_colnames("Frequency") %>%
      as.data.frame() %>%
      rownames_to_column("Type") %>%
      mutate(kmer_length = i, group = g)
    
    stat_group_chunk <- rbind(stat_group_chunk, stat_kmer_chunk)
  }
  new_stat_df <- rbind(new_stat_df, stat_group_chunk)
}


kmer_stat_plotting <- function(x, type, kmer = 31) {
  x %>%
    dplyr::filter(kmer_length >= kmer,
                  Type == type) %>%
    dplyr::mutate(
      group = case_when(
        group == "highcor" ~ "Concordant Transcripts",
        group == "lowcor" ~ "Discordant Transcripts",
        group == "hisat2" ~ "Top Negative\nPC1 Loadings",
        group == "kallisto" ~ "Top Negative\nPC2 Loadings",
        group == "star" ~ "Top Positive\nPC2 Loadings",
        group == "salmon" ~ "Top Negative\nPC5 Loadings"
      ),
      group = factor(group, levels = c("Concordant Transcripts",
                                       "Discordant Transcripts",
                                       "Top Negative\nPC1 Loadings",
                                       "Top Negative\nPC2 Loadings",
                                       "Top Positive\nPC2 Loadings",
                                       "Top Negative\nPC5 Loadings"))) %>%
    ggplot(aes(x = kmer_length, y = Frequency)) +
    geom_area(fill = "royalblue",
              colour = "darkblue",
              linewidth = 1,
              alpha = 0.7) +
    labs(x = "K-mer Length (bp)",
         y = "Frequency") +
    facet_grid(~ group) +
    theme_bw() +
    theme(strip.background = element_rect(fill = "lightblue"))
}

type_nms <- new_stat_df$Type %>% unique()

for(i in type_nms) {
  kmer_stat_plotting(new_stat_df, type = i) 
  
  ggsave(filename = paste0(i, "_kmer_length_freq.png"),
         path = here("figures/kmer_analysis/all_methods_plots/"),
         device = "png",
         height = 150,
         width = 300,
         units = "mm",
         dpi = 400)
}


hist_df <- rbind(lowcor_mer_hist,
                 highcor_mer_hist,
                 hisat2_mer_hist,
                 kallisto_mer_hist,
                 star_mer_hist,
                 salmon_mer_hist)

hist_df %>%
  dplyr::filter(kmer_length == 31) %>%
  dplyr::arrange(Repeats) %>%
  ggplot(aes(x = Repeats, y = Frequency)) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  facet_grid(~ group) +
  theme_bw()

kmer_numbers

new_stat_df %>%
  distinct() %>%
  dplyr::filter(Type == "Unique", kmer_length > 30) %>%
  dplyr:: filter(group == "highcor") %>%
  dplyr::select("Frequency") %>%
  summary()
##    Frequency     
##  Min.   :738599  
##  1st Qu.:741960  
##  Median :745325  
##  Mean   :745189  
##  3rd Qu.:748582  
##  Max.   :750831
new_stat_df %>%
  distinct() %>%
  dplyr::filter(Type == "Unique", kmer_length > 30) %>%
  dplyr:: filter(group == "lowcor") %>%
  dplyr::select("Frequency") %>%
  summary()
##    Frequency     
##  Min.   :494126  
##  1st Qu.:496865  
##  Median :499443  
##  Mean   :499336  
##  3rd Qu.:501892  
##  Max.   :504011
new_stat_df %>%
  distinct() %>%
  dplyr::filter(Type == "Unique", kmer_length > 30) %>%
  dplyr:: filter(group == "hisat2") %>%
  dplyr::select("Frequency") %>%
  summary()
##    Frequency     
##  Min.   :425694  
##  1st Qu.:427821  
##  Median :429744  
##  Mean   :429577  
##  3rd Qu.:431438  
##  Max.   :432724
new_stat_df %>%
  distinct() %>%
  dplyr::filter(Type == "Unique", kmer_length > 30) %>%
  dplyr:: filter(group == "kallisto") %>%
  dplyr::select("Frequency") %>%
  summary()
##    Frequency     
##  Min.   :506588  
##  1st Qu.:508948  
##  Median :510892  
##  Mean   :510585  
##  3rd Qu.:512473  
##  Max.   :513111
new_stat_df %>%
  distinct() %>%
  dplyr::filter(Type == "Unique", kmer_length > 30) %>%
  dplyr:: filter(group == "salmon") %>%
  dplyr::select("Frequency") %>%
  summary()
##    Frequency     
##  Min.   :429965  
##  1st Qu.:431162  
##  Median :432096  
##  Mean   :431848  
##  3rd Qu.:432664  
##  Max.   :432796
new_stat_df %>%
  distinct() %>%
  dplyr::filter(Type == "Unique", kmer_length > 30) %>%
  dplyr:: filter(group == "star") %>%
  dplyr::select("Frequency") %>%
  summary()
##    Frequency     
##  Min.   :417408  
##  1st Qu.:418605  
##  Median :419539  
##  Mean   :419261  
##  3rd Qu.:420043  
##  Max.   :420167
new_stat_df %>%
  distinct() %>%
  dplyr::filter(Type == "Multiplicity", kmer_length > 30) %>%
  dplyr:: filter(group == "highcor") %>%
  dplyr::select("Frequency") %>%
  summary()
##    Frequency     
##  Min.   :  58.0  
##  1st Qu.: 109.8  
##  Median : 161.5  
##  Mean   : 259.6  
##  3rd Qu.: 308.2  
##  Max.   :1106.0
new_stat_df %>%
  distinct() %>%
  dplyr::filter(Type == "Multiplicity", kmer_length > 30) %>%
  dplyr:: filter(group == "lowcor") %>%
  dplyr::select("Frequency") %>%
  summary()
##    Frequency    
##  Min.   :24463  
##  1st Qu.:24982  
##  Median :25598  
##  Mean   :25684  
##  3rd Qu.:26319  
##  Max.   :27337
new_stat_df %>%
  distinct() %>%
  dplyr::filter(Type == "Multiplicity", kmer_length > 30) %>%
  dplyr:: filter(group == "hisat2") %>%
  dplyr::select("Frequency") %>%
  summary()
##    Frequency    
##  Min.   :41457  
##  1st Qu.:42300  
##  Median :43260  
##  Mean   :43375  
##  3rd Qu.:44374  
##  Max.   :45830
new_stat_df %>%
  distinct() %>%
  dplyr::filter(Type == "Multiplicity", kmer_length > 30) %>%
  dplyr:: filter(group == "kallisto") %>%
  dplyr::select("Frequency") %>%
  summary()
##    Frequency    
##  Min.   :38474  
##  1st Qu.:39326  
##  Median :40414  
##  Mean   :40636  
##  3rd Qu.:41772  
##  Max.   :43845
new_stat_df %>%
  distinct() %>%
  dplyr::filter(Type == "Multiplicity", kmer_length > 30) %>%
  dplyr:: filter(group == "salmon") %>%
  dplyr::select("Frequency") %>%
  summary()
##    Frequency    
##  Min.   :23804  
##  1st Qu.:25163  
##  Median :26670  
##  Mean   :26832  
##  3rd Qu.:28365  
##  Max.   :30676
new_stat_df %>%
  distinct() %>%
  dplyr::filter(Type == "Multiplicity", kmer_length > 30) %>%
  dplyr:: filter(group == "star") %>%
  dplyr::select("Frequency") %>%
  summary()
##    Frequency    
##  Min.   :21210  
##  1st Qu.:22569  
##  Median :24076  
##  Mean   :24247  
##  3rd Qu.:25783  
##  Max.   :28149

K-mer Analysis by Unique

HISAT2

hs2_tx_remove <- txp_gene_ensdb_lengths %>%
  dplyr::filter(tx_id %in% hisat2_unique$transcript_id) %>%
  dplyr::filter(str_detect(seqnames, "^CHR")) %>%
  as.data.frame()

hisat2_unique <- hisat2_unique %>% 
  dplyr::filter(!transcript_id %in% hs2_tx_remove$tx_id)

yTx_hisat2 <- exonsBy(edb,
                       filter = TxIdFilter(hisat2_unique$transcript_id))

yTxSeqs_hisat2 <- extractTranscriptSeqs(dna, yTx_hisat2)

hisat2_kmer_df <- yTxSeqs_hisat2 %>%
  as.data.frame() %>%
  rownames_to_column("transcript_id") %>%
  mutate(transcript_id = paste0(">", transcript_id)) %>%
  set_colnames(c("transcript_id", "sequence")) %>%
  as_tibble()

hisat2_kmer_df %>%
  head(200) %>%
  write_delim(here("data/kmer_analysis/unique/hisat2/hisat2_seqs.txt"),
              delim = "\n",
              eol = "\n\n",
              col_names = FALSE)
#vol=/Volumes/Fast_T7/R_projects/MethodsChapter/data/sequence_analysis
loc=/Users/konstantinosbogias/Documents/PhD/R_Projects/MethodsChapter/data/kmer_analysis/unique

cp ${loc}/hisat2/hisat2_seqs.txt ${loc}/hisat2/hisat2_seqs.fa
for i in {1..100}; do
    jellyfish count -m ${i} -s 100M --output ${loc}/hisat2/hisat2_mer_counts.jf -C ${loc}/hisat2/hisat2_seqs.fa
    jellyfish histo ${loc}/hisat2/hisat2_mer_counts.jf > ${loc}/hisat2/histos/hisat2_${loc}_mer_counts_histo.txt
    #jellyfish dump ${loc}/hisat2/hisat2_mer_counts.jf > ${vol}/hisat2/dumps/hisat2_${i}_mer_counts_dump.fa
    jellyfish stats ${loc}/hisat2/hisat2_mer_counts.jf > ${loc}/hisat2/stats/hisat2_${i}_mer_counts_stats.txt
    echo "hisat2 ${i}_mer done"
done

Kmer data

hisat2_mer_hist <- data.frame()
for(file in list.files(here("data/kmer_analysis/unique/hisat2/histos"))) {
  hisat2_mer_add <- read_delim(
    paste0(here("data/kmer_analysis/unique/hisat2/histos/"), file),
    col_names = FALSE,
    delim = " ") %>%
    set_colnames(c("Repeats", "Frequency")) %>%
    mutate(kmer_length = file %>%
             str_extract("...mer") %>%
             str_remove("_") %>% 
             str_remove("_") %>% 
             str_remove("mer") %>%
             as.numeric())
  
  hisat2_mer_hist <- rbind(hisat2_mer_hist, hisat2_mer_add)
}

hisat2_mer_hist <- hisat2_mer_hist %>%
  mutate(group = "hisat2")

write_csv(hisat2_mer_hist,
          here("data/kmer_analysis/unique/hisat2/hisat2_mer_hist.csv"))

hisat2_mer_stat <- data.frame()
for(file in list.files(here("data/kmer_analysis/unique/hisat2/stats"))) {
  hisat2_stat_add <- read_delim(
    paste0(here("data/kmer_analysis/unique/hisat2/stats/"), file),
    col_names = FALSE,
    delim = " ") %>% 
    as.data.frame() %>%
    column_to_rownames("X1") %>%
    mutate_if(is.character, as.numeric) %>%
    mutate(Frequency = coalesce(X2, X3, X4, X5)) %>%
    rownames_to_column("Type") %>%
    dplyr::select("Type", "Frequency") %>%
    mutate(Type = str_remove(Type, ":")) %>% 
    dplyr::mutate(kmer_length = file %>%
             str_extract("...mer") %>%
             str_remove("_") %>% 
             str_remove("_") %>% 
             str_remove("mer") %>%
                    as.numeric())
  
  hisat2_mer_stat <- rbind(hisat2_mer_stat, hisat2_stat_add)
}

hisat2_mer_stat <- hisat2_mer_stat %>%
  mutate(group = "hisat2")

write_csv(hisat2_mer_stat,
          here("data/kmer_analysis/unique/hisat2/hisat2_mer_stat.csv"))

Kmer plotting

Wanna try and find long genes with less distinct kmers, so low Distinct and Unique, with a high Total. Possibly a low Max count but a high Repeat number.

hisat2_lengths <- txp_gene_ensdb_lengths %>%
  dplyr::filter(tx_id %in% hisat2_unique$transcript_id) %>%
  dplyr::select(tx_id, transcript_length, seqnames, strand, tx_biotype) %>%
  dplyr::mutate(group = "hisat2")
hisat2_lengths$transcript_length %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     539    2062    3372    3669    4908   17657
# Summarise kmer lengths
hisat2_mer_hist$kmer_length %>% table() %>% as.data.frame()
##      . Freq
## 1    0   11
## 2    1    1
## 3    2    1
## 4    3    3
## 5    4  128
## 6    5  467
## 7    6  738
## 8    7  358
## 9    8  177
## 10   9   93
## 11  10   58
## 12  11   40
## 13  12   32
## 14  13   29
## 15  14   27
## 16  15   23
## 17  16   19
## 18  17   20
## 19  18   17
## 20  19   16
## 21  20   16
## 22  21   14
## 23  22   14
## 24  23   15
## 25  24   14
## 26  25   14
## 27  26   14
## 28  27   14
## 29  28   14
## 30  29   14
## 31  30   13
## 32  31   12
## 33  32   12
## 34  33   13
## 35  34   13
## 36  35   12
## 37  36   11
## 38  37   12
## 39  38   12
## 40  39   12
## 41  40   12
## 42  41   12
## 43  42   11
## 44  43   11
## 45  44   11
## 46  45   11
## 47  46   11
## 48  47   11
## 49  48   11
## 50  49   11
## 51  50   11
## 52  51   11
## 53  52   11
## 54  53   11
## 55  54   11
## 56  55   11
## 57  56   11
## 58  57   11
## 59  58   11
## 60  59   11
## 61  60   11
## 62  61   11
## 63  62   11
## 64  63   11
## 65  64   11
## 66  65   11
## 67  66   11
## 68  67   11
## 69  68   11
## 70  69   11
## 71  70   11
## 72  71   11
## 73  72   11
## 74  73   11
## 75  74   11
## 76  75   11
## 77  76   11
## 78  77   11
## 79  78   11
## 80  79   11
## 81  80   11
## 82  81   11
## 83  82   11
## 84  83   11
## 85  84   11
## 86  85   11
## 87  86   11
## 88  87   11
## 89  88   11
## 90  89   11
## 91  90   11
## 92  91   11
## 93  92   11
## 94  93   11
## 95  94   11
## 96  95   11
## 97  96   11
## 98  97   11
## 99  98   11
## 100 99   11
hisat2_mer_hist %>%
  dplyr::filter(kmer_length >= 31) %>%
  dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  facet_grid(~ as.character(Repeats),
             scales = "free") +
  labs(x = "K-mer Length (bp)",
       y = "Frequency",
       fill = "Kmer") +
  theme_bw() +
  theme(axis.text = element_text(colour = "black", size = 10),
        axis.title = element_text(colour = "black", size = 12),
        strip.text = element_text(colour = "black", size = 12),
        strip.background = element_rect(fill = "lightblue"))

hisat2_mer_hist %>%
  dplyr::filter(kmer_length <= 20) %>%
  dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  labs(x = "K-mer Length (bp)",
       y = "Frequency",
       fill = "Kmer") +
  theme_bw() +
  theme(axis.text = element_text(colour = "black", size = 10),
        axis.title = element_text(colour = "black", size = 12),
        strip.text = element_text(colour = "black", size = 12),
        strip.background = element_rect(fill = "lightblue"),
        legend.position = "none")

hisat2_mer_stat %>%
  dplyr::filter(!Type == "Max_count") %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = Type)) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  facet_grid(~ Type) +
  theme_bw()

Kallisto

k_tx_remove <- txp_gene_ensdb_lengths %>%
  dplyr::filter(tx_id %in% kallisto_unique$transcript_id) %>%
  dplyr::filter(str_detect(seqnames, "^CHR")) %>%
  as.data.frame()

kallisto_unique <- kallisto_unique %>%
  dplyr::filter(transcript_id %in% gene_txp_anno$transcript_id)

yTx_kallisto <- exonsBy(edb, 
                        filter = TxIdFilter(kallisto_unique$transcript_id))

yTxSeqs_kallisto <- extractTranscriptSeqs(dna, yTx_kallisto)

kallisto_kmer_df <- yTxSeqs_kallisto %>%
  as.data.frame() %>%
  rownames_to_column("transcript_id") %>%
  mutate(transcript_id = paste0(">", transcript_id)) %>%
  set_colnames(c("transcript_id", "sequence")) %>%
  as_tibble()

kallisto_kmer_df %>%
  head(200) %>%
  write_delim(here("data/kmer_analysis/unique/kallisto/kallisto_seqs.txt"),
              delim = "\n",
              eol = "\n\n",
              col_names = FALSE)
#vol=/Volumes/Fast_T7/R_projects/MethodsChapter/data/sequence_analysis
loc=/Users/konstantinosbogias/Documents/PhD/R_Projects/MethodsChapter/data/kmer_analysis/unique

cp ${loc}/kallisto/kallisto_seqs.txt ${loc}/kallisto/kallisto_seqs.fa
for i in {1..100}; do
    jellyfish count -m ${i} -s 100M --output ${loc}/kallisto/kallisto_mer_counts.jf -C ${loc}/kallisto/kallisto_seqs.fa
    jellyfish histo ${loc}/kallisto/kallisto_mer_counts.jf > ${loc}/kallisto/histos/kallisto_${i}_mer_counts_histo.txt
    #jellyfish dump ${loc}/kallisto/kallisto_mer_counts.jf > ${vol}/kallisto/dumps/kallisto_${i}_mer_counts_dump.fa
    jellyfish stats ${loc}/kallisto/kallisto_mer_counts.jf > ${loc}/kallisto/stats/kallisto_${i}_mer_counts_stats.txt
    echo "kallisto ${i}_mer done"
done

Kmer data

kallisto_mer_hist <- data.frame()
for(file in list.files(here("data/kmer_analysis/unique/kallisto/histos"))) {
  kallisto_mer_add <- read_delim(
    paste0(here("data/kmer_analysis/unique/kallisto/histos/"), file),
    col_names = FALSE,
    delim = " ") %>%
    set_colnames(c("Repeats", "Frequency")) %>%
    mutate(kmer_length = file %>%
             str_extract("...mer") %>%
             str_remove("_") %>% 
             str_remove("_") %>% 
             str_remove("mer") %>%
             as.numeric())
  
  kallisto_mer_hist <- rbind(kallisto_mer_hist, kallisto_mer_add)
}

kallisto_mer_hist <- kallisto_mer_hist %>%
  mutate(group = "kallisto")

write_csv(kallisto_mer_hist,
          here("data/kmer_analysis/unique/kallisto/kallisto_mer_hist.csv"))

kallisto_mer_stat <- data.frame()
for(file in list.files(here("data/kmer_analysis/unique/kallisto/stats"))) {
  kallisto_stat_add <- read_delim(
    paste0(here("data/kmer_analysis/unique/kallisto/stats/"), file),
    col_names = FALSE,
    delim = " ") %>% 
    as.data.frame() %>%
    column_to_rownames("X1") %>%
    mutate_if(is.character, as.numeric) %>%
    mutate(Frequency = coalesce(X2, X3, X4, X5)) %>%
    rownames_to_column("Type") %>%
    dplyr::select("Type", "Frequency") %>%
    mutate(Type = str_remove(Type, ":")) %>% 
    dplyr::mutate(kmer_length = file %>%
             str_extract("...mer") %>%
             str_remove("_") %>% 
             str_remove("_") %>% 
             str_remove("mer") %>%
                    as.numeric())
  
  kallisto_mer_stat <- rbind(kallisto_mer_stat, kallisto_stat_add)
}

kallisto_mer_stat <- kallisto_mer_stat %>%
  mutate(group = "kallisto")

write_csv(kallisto_mer_stat,
          here("data/kmer_analysis/unique/kallisto/kallisto_mer_stat.csv"))

Kmer plotting

Wanna try and find long genes with less distinct kmers, so low Distinct and Unique, with a high Total. Possibly a low Max count but a high Repeat number.

kallisto_lengths <- txp_gene_ensdb_lengths %>%
  dplyr::filter(tx_id %in% kallisto_unique$transcript_id) %>%
  dplyr::select(tx_id, transcript_length, seqnames, strand, tx_biotype) %>%
  dplyr::mutate(group = "kallisto")
kallisto_lengths$transcript_length %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    53.0   564.8   999.5  1601.3  2210.2 14269.0
# Summarise kmer lengths
kallisto_mer_hist$kmer_length %>% table() %>% as.data.frame()
##      . Freq
## 1    0    3
## 2    1    1
## 3    2    2
## 4    3   14
## 5    4  133
## 6    5  421
## 7    6  395
## 8    7  171
## 9    8   80
## 10   9   47
## 11  10   29
## 12  11   21
## 13  12   20
## 14  13   17
## 15  14   16
## 16  15   16
## 17  16   14
## 18  17   14
## 19  18   13
## 20  19   11
## 21  20   10
## 22  21   11
## 23  22   10
## 24  23   11
## 25  24    8
## 26  25   10
## 27  26    8
## 28  27    9
## 29  28    7
## 30  29    9
## 31  30    8
## 32  31    9
## 33  32    7
## 34  33    8
## 35  34    7
## 36  35    8
## 37  36    6
## 38  37    6
## 39  38    5
## 40  39    5
## 41  40    4
## 42  41    4
## 43  42    3
## 44  43    3
## 45  44    3
## 46  45    3
## 47  46    3
## 48  47    3
## 49  48    3
## 50  49    3
## 51  50    3
## 52  51    3
## 53  52    3
## 54  53    3
## 55  54    3
## 56  55    3
## 57  56    3
## 58  57    3
## 59  58    3
## 60  59    3
## 61  60    3
## 62  61    3
## 63  62    3
## 64  63    3
## 65  64    3
## 66  65    3
## 67  66    3
## 68  67    3
## 69  68    3
## 70  69    3
## 71  70    3
## 72  71    3
## 73  72    3
## 74  73    3
## 75  74    3
## 76  75    3
## 77  76    3
## 78  77    3
## 79  78    3
## 80  79    3
## 81  80    3
## 82  81    3
## 83  82    3
## 84  83    3
## 85  84    3
## 86  85    3
## 87  86    3
## 88  87    3
## 89  88    3
## 90  89    3
## 91  90    3
## 92  91    3
## 93  92    3
## 94  93    3
## 95  94    3
## 96  95    3
## 97  96    3
## 98  97    3
## 99  98    3
## 100 99    3
kallisto_mer_hist %>%
  dplyr::filter(kmer_length >= 31) %>%
  dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  facet_grid(~ as.character(Repeats),
             scales = "free") +
  labs(x = "K-mer Length (bp)",
       y = "Frequency",
       fill = "Kmer") +
  theme_bw() +
  theme(axis.text = element_text(colour = "black", size = 10),
        axis.title = element_text(colour = "black", size = 12),
        strip.text = element_text(colour = "black", size = 12),
        strip.background = element_rect(fill = "lightblue"))

kallisto_mer_hist %>%
  dplyr::filter(kmer_length <= 20) %>%
  dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  labs(x = "K-mer Length (bp)",
       y = "Frequency",
       fill = "Kmer") +
  theme_bw() +
  theme(axis.text = element_text(colour = "black", size = 10),
        axis.title = element_text(colour = "black", size = 12),
        strip.text = element_text(colour = "black", size = 12),
        strip.background = element_rect(fill = "lightblue"),
        legend.position = "none")

kallisto_mer_stat %>%
  dplyr::filter(!Type == "Max_count") %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = Type)) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  facet_grid(~ Type) +
  theme_bw()

Salmon

sal_tx_remove <- txp_gene_ensdb_lengths %>%
  dplyr::filter(tx_id %in% salmon_unique$transcript_id) %>%
  dplyr::filter(str_detect(seqnames, "^CHR")) %>%
  as.data.frame()

salmon_unique <- salmon_unique %>%
  dplyr::filter(!transcript_id %in% sal_tx_remove$tx_id)

yTx_salmon <- exonsBy(edb, filter = TxIdFilter(salmon_unique$transcript_id))

yTxSeqs_salmon <- extractTranscriptSeqs(dna, yTx_salmon)

salmon_kmer_df <- yTxSeqs_salmon %>%
  as.data.frame() %>%
  rownames_to_column("transcript_id") %>%
  mutate(transcript_id = paste0(">", transcript_id)) %>%
  set_colnames(c("transcript_id", "sequence")) %>%
  as_tibble()

# Call head to make sure that we get 200 transcripts for each group!
salmon_kmer_df %>%
  head(200) %>%
  write_delim(here("data/kmer_analysis/unique/salmon/salmon_seqs.txt"),
              delim = "\n",
              eol = "\n\n",
              col_names = FALSE)
#vol=/Volumes/Fast_T7/R_projects/MethodsChapter/data/sequence_analysis
loc=/Users/konstantinosbogias/Documents/PhD/R_Projects/MethodsChapter/data/kmer_analysis/unique

cp ${loc}/salmon/salmon_seqs.txt ${loc}/salmon/salmon_seqs.fa
for i in {1..100}; do
    jellyfish count -m ${i} -s 100M --output ${loc}/salmon/salmon_mer_counts.jf -C ${loc}/salmon/salmon_seqs.fa
    jellyfish histo ${loc}/salmon/salmon_mer_counts.jf > ${loc}/salmon/histos/salmon_${i}_mer_counts_histo.txt
    #jellyfish dump ${loc}/salmon/salmon_mer_counts.jf > ${vol}/salmon/dumps/salmon_${i}_mer_counts_dump.fa
    jellyfish stats ${loc}/salmon/salmon_mer_counts.jf > ${loc}/salmon/stats/salmon_${i}_mer_counts_stats.txt
    echo "salmon ${i}_mer done"
done

Kmer data

salmon_mer_hist <- data.frame()
for(file in list.files(here("data/kmer_analysis/unique/salmon/histos"))) {
  salmon_mer_add <- read_delim(
    paste0(here("data/kmer_analysis/unique/salmon/histos/"), file),
    col_names = FALSE,
    delim = " ") %>%
    set_colnames(c("Repeats", "Frequency")) %>%
    mutate(kmer_length = file %>%
             str_extract("...mer") %>%
             str_remove("_") %>% 
             str_remove("_") %>% 
             str_remove("mer") %>%
             as.numeric())
  
  salmon_mer_hist <- rbind(salmon_mer_hist, salmon_mer_add)
}

salmon_mer_hist <- salmon_mer_hist %>%
  mutate(group = "salmon")

write_csv(salmon_mer_hist,
          here("data/kmer_analysis/unique/salmon/salmon_mer_hist.csv"))

salmon_mer_stat <- data.frame()
for(file in list.files(here("data/kmer_analysis/unique/salmon/stats"))) {
  salmon_stat_add <- read_delim(
    paste0(here("data/kmer_analysis/unique/salmon/stats/"), file),
    col_names = FALSE,
    delim = " ") %>% 
    as.data.frame() %>%
    column_to_rownames("X1") %>%
    mutate_if(is.character, as.numeric) %>%
    mutate(Frequency = coalesce(X2, X3, X4, X5)) %>%
    rownames_to_column("Type") %>%
    dplyr::select("Type", "Frequency") %>%
    mutate(Type = str_remove(Type, ":")) %>% 
    dplyr::mutate(kmer_length = file %>%
             str_extract("...mer") %>%
             str_remove("_") %>% 
             str_remove("_") %>% 
             str_remove("mer") %>%
                    as.numeric())
  
  salmon_mer_stat <- rbind(salmon_mer_stat, salmon_stat_add)
}

salmon_mer_stat <- salmon_mer_stat %>%
  mutate(group = "salmon")

write_csv(salmon_mer_stat,
          here("data/kmer_analysis/unique/salmon/salmon_mer_stat.csv"))

Kmer plotting

Wanna try and find long genes with less distinct kmers, so low Distinct and Unique, with a high Total. Possibly a low Max count but a high Repeat number.

salmon_lengths <- txp_gene_ensdb_lengths %>%
  dplyr::filter(tx_id %in% salmon_unique$transcript_id) %>%
  dplyr::select(tx_id, transcript_length, seqnames, strand, tx_biotype) %>%
  dplyr::mutate(group = "salmon")
salmon_lengths$transcript_length %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    66.0   540.0   795.5  1424.0  1871.8 13748.0
# Summarise kmer lengths
salmon_mer_hist$kmer_length %>% table() %>% as.data.frame()
##      . Freq
## 1    0    2
## 2    1    1
## 3    2    2
## 4    3   22
## 5    4  134
## 6    5  404
## 7    6  352
## 8    7  160
## 9    8   79
## 10   9   53
## 11  10   45
## 12  11   38
## 13  12   36
## 14  13   34
## 15  14   30
## 16  15   28
## 17  16   25
## 18  17   24
## 19  18   22
## 20  19   21
## 21  20   21
## 22  21   20
## 23  22   19
## 24  23   17
## 25  24   15
## 26  25   15
## 27  26   15
## 28  27   12
## 29  28   11
## 30  29   11
## 31  30   10
## 32  31   10
## 33  32    8
## 34  33    8
## 35  34    7
## 36  35    7
## 37  36    6
## 38  37    6
## 39  38    6
## 40  39    6
## 41  40    5
## 42  41    4
## 43  42    4
## 44  43    4
## 45  44    3
## 46  45    3
## 47  46    3
## 48  47    3
## 49  48    3
## 50  49    3
## 51  50    3
## 52  51    3
## 53  52    3
## 54  53    3
## 55  54    3
## 56  55    2
## 57  56    2
## 58  57    2
## 59  58    2
## 60  59    2
## 61  60    2
## 62  61    2
## 63  62    2
## 64  63    2
## 65  64    2
## 66  65    2
## 67  66    2
## 68  67    2
## 69  68    2
## 70  69    2
## 71  70    2
## 72  71    2
## 73  72    2
## 74  73    2
## 75  74    2
## 76  75    2
## 77  76    2
## 78  77    2
## 79  78    2
## 80  79    2
## 81  80    2
## 82  81    2
## 83  82    2
## 84  83    2
## 85  84    2
## 86  85    2
## 87  86    2
## 88  87    2
## 89  88    2
## 90  89    2
## 91  90    2
## 92  91    2
## 93  92    2
## 94  93    2
## 95  94    2
## 96  95    2
## 97  96    2
## 98  97    2
## 99  98    2
## 100 99    2
salmon_mer_hist %>%
  dplyr::filter(kmer_length >= 31) %>%
  dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  facet_grid(~ as.character(Repeats),
             scales = "free") +
  labs(x = "K-mer Length (bp)",
       y = "Frequency",
       fill = "Kmer") +
  theme_bw() +
  theme(axis.text = element_text(colour = "black", size = 10),
        axis.title = element_text(colour = "black", size = 12),
        strip.text = element_text(colour = "black", size = 12),
        strip.background = element_rect(fill = "lightblue"))

salmon_mer_hist %>%
  dplyr::filter(kmer_length <= 20) %>%
  dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  labs(x = "K-mer Length (bp)",
       y = "Frequency",
       fill = "Kmer") +
  theme_bw() +
  theme(axis.text = element_text(colour = "black", size = 10),
        axis.title = element_text(colour = "black", size = 12),
        strip.text = element_text(colour = "black", size = 12),
        strip.background = element_rect(fill = "lightblue"),
        legend.position = "none")

salmon_mer_stat %>%
  dplyr::filter(!Type == "Max_count") %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = Type)) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  facet_grid(~ Type) +
  theme_bw()

STAR

star_unique <- star_unique %>%
  dplyr::filter(transcript_id %in% gene_txp_anno$transcript_id)

yTx_sa <- exonsBy(edb, filter = TxIdFilter(star_unique$transcript_id))

yTxSeqs_sa <- extractTranscriptSeqs(dna, yTx_sa)

star_kmer_df <- yTxSeqs_sa %>%
  as.data.frame() %>%
  rownames_to_column("transcript_id") %>%
  mutate(transcript_id = paste0(">", transcript_id)) %>%
  set_colnames(c("transcript_id", "sequence")) %>%
  as_tibble()

star_kmer_df %>%
  head(200) %>%
  write_delim(here("data/kmer_analysis/unique/star/star_seqs.txt"),
            delim = "\n",
            eol = "\n\n",
            col_names = FALSE)

# star_kmer_df %>%
#   head(20) %>%
#   write_delim(here("data/kmer_analysis/unique/star_test/star_seqs_test.txt"),
#               delim = "\n",
#               eol = "\n\n",
#               col_names = FALSE)
#vol=/Volumes/Fast_T7/R_projects/MethodsChapter/data/sequence_analysis
loc=/Users/konstantinosbogias/Documents/PhD/R_Projects/MethodsChapter/data/kmer_analysis/unique

cp ${loc}/star/star_seqs.txt ${loc}/star/star_seqs.fa
for i in {1..100}; do
    jellyfish count -m ${i} -s 100M --output ${loc}/star/star_mer_counts.jf -C ${loc}/star/star_seqs.fa
    jellyfish histo ${loc}/star/star_mer_counts.jf > ${loc}/star/histos/star_${i}_mer_counts_histo.txt
    #jellyfish dump ${loc}/star/star_mer_counts.jf > ${vol}/star/dumps/star_${i}_mer_counts_dump.fa
    jellyfish stats ${loc}/star/star_mer_counts.jf > ${loc}/star/stats/star_${i}_mer_counts_stats.txt
    echo "sa ${i}_mer done"
done

Kmer data

star_mer_hist <- data.frame()
for(file in list.files(here("data/kmer_analysis/unique/star/histos"))) {
  star_mer_add <- read_delim(
    paste0(here("data/kmer_analysis/unique/star/histos/"), file),
    col_names = FALSE,
    delim = " ") %>%
    set_colnames(c("Repeats", "Frequency")) %>%
    mutate(kmer_length = file %>%
             str_extract("...mer") %>%
             str_remove("_") %>% 
             str_remove("_") %>% 
             str_remove("mer") %>%
             as.numeric())
  
  star_mer_hist <- rbind(star_mer_hist, star_mer_add)
}

star_mer_hist <- star_mer_hist %>%
  mutate(group = "star")

write_csv(star_mer_hist,
          here("data/kmer_analysis/unique/star/star_mer_hist.csv"))

star_mer_stat <- data.frame()
for(file in list.files(here("data/kmer_analysis/unique/star/stats"))) {
  star_stat_add <- read_delim(
    paste0(here("data/kmer_analysis/unique/star/stats/"), file),
    col_names = FALSE,
    delim = " ") %>% 
    as.data.frame() %>%
    column_to_rownames("X1") %>%
    mutate_if(is.character, as.numeric) %>%
    mutate(Frequency = coalesce(X2, X3, X4, X5)) %>%
    rownames_to_column("Type") %>%
    dplyr::select("Type", "Frequency") %>%
    mutate(Type = str_remove(Type, ":")) %>% 
    dplyr::mutate(kmer_length = file %>%
             str_extract("...mer") %>%
             str_remove("_") %>% 
             str_remove("_") %>% 
             str_remove("mer") %>%
                    as.numeric())
  
  star_mer_stat <- rbind(star_mer_stat, star_stat_add)
}

star_mer_stat <- star_mer_stat %>%
  mutate(group = "star")

write_csv(star_mer_stat,
          here("data/kmer_analysis/unique/star/star_mer_stat.csv"))

Kmer plotting

Wanna try and find long genes with less distinct kmers, so low Distinct and Unique, with a high Total. Possibly a low Max count but a high Repeat number.

star_lengths <- txp_gene_ensdb_lengths %>%
  dplyr::filter(tx_id %in% star_unique$transcript_id) %>%
  dplyr::select(tx_id, transcript_length, seqnames, strand, tx_biotype) %>%
  dplyr::mutate(group = "star")
star_lengths$transcript_length %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      81     506     732    1168    1508   11640
# Summarise kmer lengths
star_mer_hist$kmer_length %>% table() %>% as.data.frame()
##      . Freq
## 1    0    2
## 2    1    1
## 3    2    3
## 4    3   26
## 5    4  133
## 6    5  389
## 7    6  349
## 8    7  150
## 9    8   70
## 10   9   36
## 11  10   24
## 12  11   19
## 13  12   19
## 14  13   17
## 15  14   15
## 16  15   13
## 17  16   13
## 18  17   12
## 19  18    9
## 20  19    8
## 21  20    8
## 22  21    7
## 23  22    7
## 24  23    7
## 25  24    7
## 26  25    6
## 27  26    6
## 28  27    6
## 29  28    6
## 30  29    6
## 31  30    6
## 32  31    5
## 33  32    5
## 34  33    5
## 35  34    4
## 36  35    4
## 37  36    4
## 38  37    4
## 39  38    4
## 40  39    4
## 41  40    4
## 42  41    4
## 43  42    4
## 44  43    4
## 45  44    4
## 46  45    4
## 47  46    3
## 48  47    3
## 49  48    3
## 50  49    3
## 51  50    3
## 52  51    3
## 53  52    3
## 54  53    3
## 55  54    3
## 56  55    3
## 57  56    3
## 58  57    3
## 59  58    3
## 60  59    3
## 61  60    3
## 62  61    3
## 63  62    3
## 64  63    3
## 65  64    3
## 66  65    3
## 67  66    2
## 68  67    2
## 69  68    2
## 70  69    2
## 71  70    2
## 72  71    2
## 73  72    2
## 74  73    2
## 75  74    2
## 76  75    2
## 77  76    2
## 78  77    2
## 79  78    2
## 80  79    2
## 81  80    2
## 82  81    2
## 83  82    2
## 84  83    2
## 85  84    2
## 86  85    2
## 87  86    2
## 88  87    2
## 89  88    2
## 90  89    2
## 91  90    2
## 92  91    2
## 93  92    2
## 94  93    2
## 95  94    2
## 96  95    2
## 97  96    2
## 98  97    2
## 99  98    2
## 100 99    2
star_mer_hist %>%
  dplyr::filter(kmer_length >= 31) %>%
  ggplot(aes(x = kmer_length, y = Repeats)) + 
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7)

star_mer_hist %>%
  dplyr::filter(kmer_length >= 31) %>%
  dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  facet_grid(~ as.character(Repeats),
             scales = "free") +
  labs(x = "K-mer Length (bp)",
       y = "Frequency",
       fill = "Kmer") +
  theme_bw() +
  theme(axis.text = element_text(colour = "black", size = 10),
        axis.title = element_text(colour = "black", size = 12),
        strip.text = element_text(colour = "black", size = 12),
        strip.background = element_rect(fill = "lightblue"))

star_mer_hist %>%
  dplyr::filter(kmer_length <= 20) %>%
  dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  labs(x = "K-mer Length (bp)",
       y = "Frequency",
       fill = "Kmer") +
  theme_bw() +
  theme(axis.text = element_text(colour = "black", size = 10),
        axis.title = element_text(colour = "black", size = 12),
        strip.text = element_text(colour = "black", size = 12),
        strip.background = element_rect(fill = "lightblue"),
        legend.position = "none")

star_mer_stat %>%
  dplyr::filter(!Type == "Max_count") %>%
  ggplot(aes(x = kmer_length, y = Frequency, fill = Type)) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  facet_grid(~ Type) +
  theme_bw()

Plotting

figure 5

lengths_df <- rbind(highcor_lengths,
                    lowcor_lengths,
                    hisat2_lengths,
                    kallisto_lengths,
                    star_lengths,
                    salmon_lengths)

lengths_by_method <- lengths_df %>%
  dplyr::mutate(
    group = case_when(
      group == "highcor" ~ "Concordant",
      group == "kallisto" ~ "Kallisto",
      group == "star" ~ "STAR",
      group == "hisat2" ~ "HISAT2",
      group == "salmon" ~ "Salmon",
      .default = "Discordant"
    ),
    group = factor(group, levels = c("Concordant",
                                     "Discordant",
                                     "HISAT2",
                                     "Kallisto",
                                     "Salmon",
                                     "STAR"))) %>%
  ggplot(aes(x = group, y = transcript_length)) +
  geom_boxplot(fill = "lightblue") +
  scale_y_log10() +
  labs(x = "",
       y = "Transcript Length (log10)") +
  theme_bw()

lengths_by_method %>%
  ggsave(filename = "length_by_method_plot.png",
         path = here("figures/kmer_analysis/unique/all_methods_plots/"),
         device = "png",
         height = 200,
         width = 200,
         units = "mm",
         dpi = 400)

txp_lengths_hist <- txp_gene_ensdb_lengths %>%
  dplyr::select(tx_id, transcript_length) %>%
  mutate(group = "All Transcripts") %>%
  rbind(lengths_df %>% 
          dplyr::filter(group == "lowcor") %>%
          dplyr::select(tx_id, transcript_length) %>%
          dplyr::mutate(group = "Discordant Transcripts")) %>%
  ggplot(aes(x = log10(transcript_length))) +
  geom_histogram(bins = 200) +
  scale_x_continuous(limits = c(1, 6),
                     breaks = c(1, 2, 3, 4, 5, 6)) +
  labs(x = "Transcript Length (log10)",
       y = "Frequency") +
  facet_grid(group ~ ., switch = "y", scales = "free") +
  theme_bw() +
  theme(axis.text = element_text(colour = "black", size = 12),
        axis.title = element_text(colour = "black", size = 14),
        strip.background = element_rect(fill = "lightblue"),
        strip.text = element_text(colour = "black", size = 14))

txp_lengths_hist %>%
  ggsave(filename = "discor_length_plot.png",
         path = here("figures/kmer_analysis/unique/all_methods_plots/"),
         device = "png",
         height = 150,
         width = 200,
         units = "mm",
         dpi = 400)

# We can see that there is stuff all variation in frequency within groups
stat_df <- rbind(highcor_mer_stat,
                 lowcor_mer_stat,
                 hisat2_mer_stat,
                 kallisto_mer_stat,
                 star_mer_stat,
                 salmon_mer_stat)

stat_df <- stat_df %>%
  dplyr::mutate(kmer_length = case_when(
    kmer_length == 0 ~ 100,
    .default = kmer_length
  ))

group_names <- stat_df$group %>% unique()
new_stat_df <- data.frame()
stat_group_chunk <- data.frame()
for(g in group_names) {
  for(i in 1:100) {
    stat_kmer_chunk <- stat_df %>%
      dplyr::filter(kmer_length == i) %>%
      dplyr::filter(group == g) %>%
      dplyr::select(Type, Frequency) %>%
      distinct() %>%
      column_to_rownames("Type") %>%
      t() %>%
      as_tibble() %>%
      mutate(Multiplicity = Total-Distinct,
             Percent_Multi = Multiplicity/Total,
             Percent_Unique = Unique/Total) %>%
      t() %>%
      set_colnames("Frequency") %>%
      as.data.frame() %>%
      rownames_to_column("Type") %>%
      mutate(kmer_length = i, group = g)
    
    stat_group_chunk <- rbind(stat_group_chunk, stat_kmer_chunk)
  }
  new_stat_df <- rbind(new_stat_df, stat_group_chunk)
}


kmer_stat_plotting <- function(x, type, kmer = 31) {
  x %>%
    dplyr::filter(kmer_length >= kmer,
                  Type == type) %>%
    dplyr::mutate(
      group = case_when(
        group == "highcor" ~ "Concordant Transcripts",
        group == "lowcor" ~ "Discordant Transcripts",
        group == "hisat2" ~ "HISAT2",
        group == "kallisto" ~ "Kallisto",
        group == "star" ~ "STAR",
        group == "salmon" ~ "Salmon"
      ),
      group = factor(group, levels = c("Concordant Transcripts",
                                       "Discordant Transcripts",
                                       "HISAT2",
                                       "Kallisto",
                                       "STAR",
                                       "Salmon"))) %>%
    ggplot(aes(x = kmer_length, y = Frequency)) +
    geom_area(fill = "royalblue",
              colour = "darkblue",
              linewidth = 1,
              alpha = 0.7) +
    scale_y_log10() +
    labs(x = "K-mer Length (bp)",
         y = "Frequency") +
    facet_grid(~ group) +
    theme_bw() +
    theme(strip.background = element_rect(fill = "lightblue"))
}

type_nms <- new_stat_df$Type %>% unique()

for(i in type_nms) {
  kmer_stat_plotting(new_stat_df, type = i) 
  
  ggsave(filename = paste0(i, "_kmer_length_freq.png"),
         path = here("figures/kmer_analysis/unique/all_methods_plots_log10/"),
         device = "png",
         height = 150,
         width = 300,
         units = "mm",
         dpi = 400)
}


hist_df <- rbind(lowcor_mer_hist,
                 highcor_mer_hist,
                 hisat2_mer_hist,
                 kallisto_mer_hist,
                 star_mer_hist,
                 salmon_mer_hist)

hist_df %>%
  dplyr::filter(kmer_length == 31) %>%
  dplyr::arrange(Repeats) %>%
  ggplot(aes(x = Repeats, y = Frequency)) +
  geom_area(fill = "royalblue",
            colour = "darkblue",
            linewidth = 1,
            alpha = 0.7) +
  facet_grid(~ group) +
  theme_bw()

Lollipop plot for kmers

percent_lolpop <- new_stat_df %>%
  dplyr::filter(Type %in% c("Percent_Multi", "Percent_Unique")) %>%
  dplyr::mutate(Percent = Frequency*100) %>%
  dplyr::filter(kmer_length == 31) %>%
  dplyr::mutate(group = case_when(
    group == "highcor" ~ "Concordant",
    group == "lowcor" ~ "Discordant",
    group == "hisat2" ~ "HISAT2",
    group == "kallisto" ~ "Kallisto",
    group == "salmon" ~ "Salmon",
    group == "star" ~ "STAR"
  ),
  Type = case_when(Type == "Percent_Multi" ~ "K-mer Repeats (%)",
                   .default = "Unique K-mers (%)"),
  group = factor(group, levels = rev(c("Concordant", "Discordant", 
                                       "HISAT2", "Kallisto",
                                       "Salmon", "STAR")))) %>%
  ggplot(aes(x = Percent, y = group)) +
  geom_point() +
  geom_segment(aes(x = 0, xend = Percent, y = group, yend = group),
               colour = "skyblue", linewidth = 1) +
  geom_point(colour = "skyblue", size = 6) +
  geom_point(colour = "royalblue", size = 4) +
  geom_point(colour = "darkblue", size = 2) +
  labs(x = "Percentage (%)") +
  facet_wrap(~ Type) +
  theme_bw() +
  theme(axis.title = element_text(colour = "black", size = 12),
        axis.text = element_text(colour = "black", size = 10),
        strip.text = element_text(colour = "black", size = 11),
        strip.background = element_rect(fill = "lightblue"),
        axis.title.y = element_blank(),
        axis.ticks.y = element_blank())

percent_lolpop %>%
    ggsave(filename = "percent_lollipop.png",
         path = here("figures/kmer_analysis/unique/lollipops/"),
         device = "png",
         height = 150,
         width = 300,
         units = "mm",
         dpi = 400)
multi_lolpop <- new_stat_df %>%
  dplyr::filter(Type == "Multiplicity") %>%
  dplyr::mutate(Percent = Frequency*100) %>%
  dplyr::filter(kmer_length == 31) %>%
    dplyr::mutate(group = case_when(
                  group == "highcor" ~ "Concordant",
                  group == "lowcor" ~ "Discordant",
                  group == "hisat2" ~ "HISAT2",
                  group == "kallisto" ~ "Kallisto",
                  group == "salmon" ~ "Salmon",
                  group == "star" ~ "STAR"
                ),
                group = factor(group, levels = rev(c("Concordant", "Discordant", 
                                                 "HISAT2", "Kallisto",
                                                 "Salmon", "STAR")))) %>%
  ggplot(aes(x = Frequency, y = group)) +
    geom_segment(aes(x = 0, xend = Frequency, y = group, yend = group),
               colour = "skyblue", linewidth = 1) +
  geom_point(colour = "skyblue", size = 6) +
  geom_point(colour = "royalblue", size = 4) +
  geom_point(colour = "darkblue", size = 2) +
  labs(x = "K-mer Repeat Frequency") +
  scale_x_log10() +
  theme_bw() +
  theme(axis.title = element_text(colour = "black", size = 12),
        axis.text = element_text(colour = "black", size = 10),
        axis.text.y = element_text(face = "bold"),
        strip.text = element_text(colour = "black", size = 11),
        strip.background = element_rect(fill = "lightblue"),
        axis.title.y = element_blank(),
        axis.ticks.y = element_blank())

multi_lolpop %>%
    ggsave(filename = "multi_lollipop.png",
         path = here("figures/kmer_analysis/unique/lollipops/"),
         device = "png",
         height = 200,
         width = 250,
         units = "mm",
         dpi = 400)
uniq_dis_lolpop <- new_stat_df %>%
  dplyr::filter(kmer_length == 31 & Type %in% c("Unique",
                                                "Distinct",
                                                "Total")) %>%
  dplyr::mutate(group = case_when(
                  group == "highcor" ~ "Concordant",
                  group == "lowcor" ~ "Discordant",
                  group == "hisat2" ~ "HISAT2",
                  group == "kallisto" ~ "Kallisto",
                  group == "salmon" ~ "Salmon",
                  group == "star" ~ "STAR"
                ),
                group = factor(group, levels = rev(c("Concordant", "Discordant", 
                                                 "HISAT2", "Kallisto",
                                                 "Salmon", "STAR"))),
                Type = factor(Type,
                              levels = c("Unique", "Distinct", "Total"))) %>%
  distinct() %>%
  ggplot(aes(x = Frequency, y = group)) +
  geom_segment(aes(x = 0, xend = Frequency,
                   y = group, yend = group),
               colour = "skyblue", linewidth = 1) +
  geom_point(colour = "skyblue", size = 6) +
  geom_point(colour = "royalblue", size = 4) +
  geom_point(colour = "darkblue", size = 2) +
  scale_x_log10() +
  scale_x_continuous(limits = c(0, 8e5)) +
  facet_wrap(~ Type, scales = "fixed") +
  theme_bw() +
  theme(axis.title = element_text(colour = "black", size = 12),
        axis.text = element_text(colour = "black", size = 10),
        strip.text = element_text(colour = "black", size = 11),
        strip.background = element_rect(fill = "lightblue"),
        axis.title.y = element_blank(),
        axis.ticks.y = element_blank())

uniq_dis_lolpop %>%
    ggsave(filename = "uniq_dis_lollipop.png",
         path = here("figures/kmer_analysis/unique/lollipops/"),
         device = "png",
         height = 200,
         width = 500,
         units = "mm",
         dpi = 400)

kmer_numbers

new_stat_df %>%
  distinct() %>%
  dplyr::filter(Type == "Unique", kmer_length > 30) %>%
  dplyr:: filter(group == "highcor") %>%
  dplyr::select("Frequency") %>%
  summary()
##    Frequency     
##  Min.   :738599  
##  1st Qu.:741960  
##  Median :745325  
##  Mean   :745189  
##  3rd Qu.:748582  
##  Max.   :750831
new_stat_df %>%
  distinct() %>%
  dplyr::filter(Type == "Unique", kmer_length > 30) %>%
  dplyr:: filter(group == "lowcor") %>%
  dplyr::select("Frequency") %>%
  summary()
##    Frequency     
##  Min.   :494126  
##  1st Qu.:496865  
##  Median :499443  
##  Mean   :499336  
##  3rd Qu.:501892  
##  Max.   :504011
new_stat_df %>%
  distinct() %>%
  dplyr::filter(Type == "Unique", kmer_length > 30) %>%
  dplyr:: filter(group == "hisat2") %>%
  dplyr::select("Frequency") %>%
  summary()
##    Frequency     
##  Min.   :480268  
##  1st Qu.:481822  
##  Median :483368  
##  Mean   :483303  
##  3rd Qu.:484817  
##  Max.   :486058
new_stat_df %>%
  distinct() %>%
  dplyr::filter(Type == "Unique", kmer_length > 30) %>%
  dplyr:: filter(group == "kallisto") %>%
  dplyr::select("Frequency") %>%
  summary()
##    Frequency     
##  Min.   :282407  
##  1st Qu.:285575  
##  Median :288716  
##  Mean   :288627  
##  3rd Qu.:291755  
##  Max.   :294368
new_stat_df %>%
  distinct() %>%
  dplyr::filter(Type == "Unique", kmer_length > 30) %>%
  dplyr:: filter(group == "salmon") %>%
  dplyr::select("Frequency") %>%
  summary()
##    Frequency     
##  Min.   :238655  
##  1st Qu.:241830  
##  Median :245008  
##  Mean   :244930  
##  3rd Qu.:248152  
##  Max.   :250567
new_stat_df %>%
  distinct() %>%
  dplyr::filter(Type == "Unique", kmer_length > 30) %>%
  dplyr:: filter(group == "star") %>%
  dplyr::select("Frequency") %>%
  summary()
##    Frequency     
##  Min.   :230823  
##  1st Qu.:233704  
##  Median :236542  
##  Mean   :236315  
##  3rd Qu.:239023  
##  Max.   :240967
new_stat_df %>%
  distinct() %>%
  dplyr::filter(Type == "Multiplicity", kmer_length > 30) %>%
  dplyr:: filter(group == "highcor") %>%
  dplyr::select("Frequency") %>%
  summary()
##    Frequency     
##  Min.   :  58.0  
##  1st Qu.: 109.8  
##  Median : 161.5  
##  Mean   : 259.6  
##  3rd Qu.: 308.2  
##  Max.   :1106.0
new_stat_df %>%
  distinct() %>%
  dplyr::filter(Type == "Multiplicity", kmer_length > 30) %>%
  dplyr:: filter(group == "lowcor") %>%
  dplyr::select("Frequency") %>%
  summary()
##    Frequency    
##  Min.   :24463  
##  1st Qu.:24982  
##  Median :25598  
##  Mean   :25684  
##  3rd Qu.:26319  
##  Max.   :27337
new_stat_df %>%
  distinct() %>%
  dplyr::filter(Type == "Multiplicity", kmer_length > 30) %>%
  dplyr:: filter(group == "hisat2") %>%
  dplyr::select("Frequency") %>%
  summary()
##    Frequency     
##  Min.   :162635  
##  1st Qu.:163910  
##  Median :165190  
##  Mean   :165226  
##  3rd Qu.:166523  
##  Max.   :167974
new_stat_df %>%
  distinct() %>%
  dplyr::filter(Type == "Multiplicity", kmer_length > 30) %>%
  dplyr:: filter(group == "kallisto") %>%
  dplyr::select("Frequency") %>%
  summary()
##    Frequency   
##  Min.   :4686  
##  1st Qu.:4824  
##  Median :4978  
##  Mean   :5028  
##  3rd Qu.:5188  
##  Max.   :5652
new_stat_df %>%
  distinct() %>%
  dplyr::filter(Type == "Multiplicity", kmer_length > 30) %>%
  dplyr:: filter(group == "salmon") %>%
  dplyr::select("Frequency") %>%
  summary()
##    Frequency    
##  Min.   : 9374  
##  1st Qu.: 9491  
##  Median : 9627  
##  Mean   : 9685  
##  3rd Qu.: 9789  
##  Max.   :10491
new_stat_df %>%
  distinct() %>%
  dplyr::filter(Type == "Multiplicity", kmer_length > 30) %>%
  dplyr:: filter(group == "star") %>%
  dplyr::select("Frequency") %>%
  summary()
##    Frequency   
##  Min.   :3599  
##  1st Qu.:3875  
##  Median :4181  
##  Mean   :4306  
##  3rd Qu.:4678  
##  Max.   :5494